diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..26809e1d8908f323344e17469a246fbcc2decb63 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/plywood-4k.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.hdf5 filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.obj filter=lfs diff=lfs merge=lfs -text
+*.mtl filter=lfs diff=lfs merge=lfs -text
+*.stl filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.dae filter=lfs diff=lfs merge=lfs -text
+*.hdr filter=lfs diff=lfs merge=lfs -text
+*.msh filter=lfs diff=lfs merge=lfs -text
diff --git a/app.py b/app.py
index c779f46a9c5c1cbe85b2ca5999e182e9a3cde2f1..9091d56ef695d515a9b7e56c5045716ff02783ee 100644
--- a/app.py
+++ b/app.py
@@ -1,109 +1,427 @@
"""
-Phantom Video Processor - Hugging Face Space
+Phantom Video Processor - Hugging Face Space Demo
+Convert human hand videos into robot demonstration data
"""
import gradio as gr
import spaces
import subprocess
import sys
+import os
+import shutil
+import tempfile
from pathlib import Path
-# ========== 环境配置 ==========
-
+# ========== Path configuration ==========
PHANTOM_DIR = Path("/home/user/app/phantom")
+DATA_RAW_DIR = PHANTOM_DIR / "data" / "raw"
+DATA_PROCESSED_DIR = PHANTOM_DIR / "data" / "processed"
+MANO_DIR = PHANTOM_DIR / "submodules" / "phantom-hamer" / "_DATA" / "data" / "mano"
+
+# Add Phantom to the Python path
+if PHANTOM_DIR.exists():
+ sys.path.insert(0, str(PHANTOM_DIR))
+ sys.path.insert(0, str(PHANTOM_DIR / "phantom"))
+
+# ========== Environment checks ==========
+def check_environment():
+    """Check the environment status."""
+ status = {
+ "phantom_installed": Path("/tmp/.phantom_ready").exists(),
+ "mano_ready": (MANO_DIR / "MANO_LEFT.pkl").exists() and (MANO_DIR / "MANO_RIGHT.pkl").exists(),
+ "sample_data": (DATA_RAW_DIR / "pick_and_place").exists(),
+ "cuda_available": False,
+ "gpu_name": None
+ }
+
+ try:
+ import torch
+ status["cuda_available"] = torch.cuda.is_available()
+ if status["cuda_available"]:
+ status["gpu_name"] = torch.cuda.get_device_name(0)
+    except Exception:
+ pass
+
+ return status
+
+def get_status_text():
+ """获取状态文本"""
+ status = check_environment()
+ lines = []
+ lines.append("=" * 40)
+ lines.append("环境状态")
+ lines.append("=" * 40)
+ lines.append(f"Phantom 安装: {'✅' if status['phantom_installed'] else '❌ 首次运行需初始化'}")
+ lines.append(f"MANO 模型: {'✅' if status['mano_ready'] else '❌ 请上传 MANO 模型文件'}")
+ lines.append(f"示例数据: {'✅' if status['sample_data'] else '⏳ 将自动下载'}")
+ lines.append(f"CUDA: {'✅ ' + (status['gpu_name'] or '') if status['cuda_available'] else '⏳ GPU 将在处理时分配'}")
+ lines.append("=" * 40)
+ return "\n".join(lines)
+
+# ========== MANO model upload ==========
+def upload_mano_files(left_file, right_file):
+    """Save the uploaded MANO model files."""
+ MANO_DIR.mkdir(parents=True, exist_ok=True)
+
+ messages = []
-def setup_environment():
- """配置Phantom环境(仅首次运行)"""
-
- # 检查是否已配置
+ if left_file is not None:
+ dest = MANO_DIR / "MANO_LEFT.pkl"
+ shutil.copy(left_file.name, dest)
+ messages.append(f"✅ MANO_LEFT.pkl 已保存")
+
+ if right_file is not None:
+ dest = MANO_DIR / "MANO_RIGHT.pkl"
+ shutil.copy(right_file.name, dest)
+ messages.append(f"✅ MANO_RIGHT.pkl 已保存")
+
+ if not messages:
+ return "⚠️ 请选择文件上传"
+
+ return "\n".join(messages) + "\n\n" + get_status_text()
+
+# ========== Environment initialization ==========
+def initialize_environment(progress=gr.Progress()):
+    """Initialize the Phantom environment."""
if Path("/tmp/.phantom_ready").exists():
- print("✅ Phantom环境已配置")
- return True
-
- print("🔧 首次运行,配置环境(约5-10分钟)...")
-
- # 运行setup.sh
+ return "✅ 环境已就绪\n\n" + get_status_text()
+
+ progress(0, desc="开始初始化...")
+
setup_script = Path("/home/user/app/setup.sh")
- if setup_script.exists():
- try:
- result = subprocess.run(
- ["bash", str(setup_script)],
- check=True,
- capture_output=True,
- text=True
- )
- print(result.stdout)
- print("✅ 环境配置完成")
- return True
- except subprocess.CalledProcessError as e:
- print(f"❌ 配置失败: {e.stderr}")
- return False
- else:
- print("⚠️ setup.sh不存在")
- return False
+ if not setup_script.exists():
+ return "❌ setup.sh 不存在"
-# 添加Phantom到Python路径
-if PHANTOM_DIR.exists():
- sys.path.insert(0, str(PHANTOM_DIR))
+ try:
+        # Run setup.sh
+        progress(0.1, desc="Running the install script...")
+ process = subprocess.Popen(
+ ["bash", str(setup_script)],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ bufsize=1
+ )
+
+ output_lines = []
+ for line in iter(process.stdout.readline, ''):
+ output_lines.append(line.strip())
+ if len(output_lines) > 50:
+                output_lines = output_lines[-50:]  # keep only the last 50 lines
+
+ process.wait()
-# 启动时配置环境
-phantom_ready = setup_environment()
+ if process.returncode == 0:
+ progress(1.0, desc="完成!")
+ return "✅ 初始化完成!\n\n" + "\n".join(output_lines[-20:]) + "\n\n" + get_status_text()
+ else:
+ return f"❌ 初始化失败 (返回码: {process.returncode})\n\n" + "\n".join(output_lines[-30:])
-# ========== 其余代码保持不变 ==========
+ except Exception as e:
+ return f"❌ 初始化错误: {str(e)}"
-@spaces.GPU(duration=120)
-def process_video(video_file, robot_type, target_hand):
- """处理视频"""
+# ========== Video processing ==========
+@spaces.GPU(duration=300)
+def process_video(
+ video_file,
+ robot_type,
+ target_hand,
+ processing_mode,
+ use_sample_data,
+ progress=gr.Progress()
+):
+ """
+    Process a video: replace the human hand with a virtual robot.
+ """
import torch
- if video_file is None:
- return None, None, "请先上传视频"
+    # Status messages
+ status_lines = []
- # 检查GPU
+    # GPU check
if torch.cuda.is_available():
gpu = torch.cuda.get_device_name(0)
- status = f"✅ GPU: {gpu}\n"
+ status_lines.append(f"✅ GPU: {gpu}")
+ status_lines.append(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
- status = "⚠️ 未检测到GPU\n"
-
- status += f"视频: {video_file}\n"
- status += f"机器人: {robot_type}\n"
- status += f"手部: {target_hand}\n"
-
- if not phantom_ready:
- status += "\n⚠️ Phantom环境未就绪"
-
- return None, None, status
-
-# Gradio界面
-with gr.Blocks(title="Phantom") as demo:
- gr.Markdown("# 🤖 Phantom - 机器人视频生成器")
-
- with gr.Row():
- with gr.Column():
- video_input = gr.Video(label="上传视频")
- robot_type = gr.Dropdown(
- choices=["Panda", "Kinova3", "UR5e"],
- value="Panda",
- label="机器人类型"
- )
- target_hand = gr.Radio(
- choices=["left", "right"],
- value="left",
- label="目标手部"
+ status_lines.append("❌ GPU 不可用")
+ return None, None, "\n".join(status_lines)
+
+ # 检查环境
+ if not Path("/tmp/.phantom_ready").exists():
+ status_lines.append("❌ 请先点击「初始化环境」按钮")
+ return None, None, "\n".join(status_lines)
+
+ # 检查 MANO
+ if not (MANO_DIR / "MANO_LEFT.pkl").exists():
+ status_lines.append("❌ 请先上传 MANO 模型文件")
+ return None, None, "\n".join(status_lines)
+
+ progress(0.1, desc="准备处理...")
+
+ # 确定输入数据
+ if use_sample_data:
+ demo_name = "pick_and_place"
+ data_root = str(DATA_RAW_DIR)
+ status_lines.append(f"📂 使用示例数据: {demo_name}")
+ else:
+ if video_file is None:
+ status_lines.append("❌ 请上传视频或选择使用示例数据")
+ return None, None, "\n".join(status_lines)
+
+ # 创建临时目录存放上传的视频
+ demo_name = "user_upload"
+ user_data_dir = DATA_RAW_DIR / demo_name / "0"
+ user_data_dir.mkdir(parents=True, exist_ok=True)
+
+ # 复制视频到正确位置
+ video_dest = user_data_dir / "video.mkv"
+ shutil.copy(video_file, video_dest)
+ data_root = str(DATA_RAW_DIR)
+ status_lines.append(f"📂 处理上传视频: {video_file}")
+
+ status_lines.append(f"🤖 机器人类型: {robot_type}")
+ status_lines.append(f"✋ 目标手部: {target_hand}")
+ status_lines.append(f"⚙️ 处理模式: {processing_mode}")
+ status_lines.append("-" * 40)
+
+ progress(0.2, desc="开始处理...")
+
+ # 构建处理命令
+ cmd = [
+ sys.executable,
+ str(PHANTOM_DIR / "phantom" / "process_data.py"),
+ f"demo_name={demo_name}",
+ f"data_root_dir={data_root}",
+ f"processed_data_root_dir={str(DATA_PROCESSED_DIR)}",
+ f"mode={processing_mode}",
+ f"robot={robot_type}",
+ f"target_hand={target_hand}",
+ "bimanual_setup=single_arm",
+ "demo_num=0", # 只处理第一个 demo
+ ]
+
+ status_lines.append(f"命令: {' '.join(cmd)}")
+
+ try:
+ # 运行处理
+ progress(0.3, desc="处理中...")
+
+ process = subprocess.Popen(
+ cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ cwd=str(PHANTOM_DIR / "phantom"),
+ env={**os.environ, "PYTHONPATH": str(PHANTOM_DIR)}
+ )
+
+ output_lines = []
+ for line in iter(process.stdout.readline, ''):
+ line = line.strip()
+ if line:
+ output_lines.append(line)
+                # Update progress based on stage keywords in the log output
+                if "BBOX" in line:
+                    progress(0.4, desc="Detecting bounding boxes...")
+                elif "HAND2D" in line:
+                    progress(0.5, desc="Extracting 2D hand poses...")
+                elif "SEGMENTATION" in line:
+                    progress(0.6, desc="Segmenting arms...")
+                elif "ACTION" in line:
+                    progress(0.7, desc="Extracting actions...")
+                elif "INPAINT" in line:
+                    progress(0.8, desc="Inpainting video...")
+                elif "ROBOT" in line:
+                    progress(0.9, desc="Overlaying robot...")
+
+ process.wait()
+
+ progress(1.0, desc="完成!")
+
+ # 添加处理输出
+ status_lines.append("-" * 40)
+ status_lines.append("处理日志 (最后 20 行):")
+ status_lines.extend(output_lines[-20:])
+
+ # 查找输出文件
+ output_video = None
+ output_data = None
+
+ processed_dir = DATA_PROCESSED_DIR / demo_name / "0"
+
+ # 查找生成的视频
+ video_pattern = f"video_overlay_{robot_type}_single_arm.mkv"
+ for f in processed_dir.glob("**/*.mkv"):
+ if robot_type.lower() in f.name.lower():
+ output_video = str(f)
+ break
+
+        # Find the training data
+        for f in processed_dir.glob("**/training_data*.npz"):
+            output_data = str(f)
+            break
+
+        if output_video:
+            status_lines.append(f"\n✅ Output video: {output_video}")
+        if output_data:
+            status_lines.append(f"✅ Training data: {output_data}")
+
+        if process.returncode == 0:
+            status_lines.insert(0, "✅ Processing complete!")
+        else:
+            status_lines.insert(0, f"⚠️ Processing finished with warnings (return code: {process.returncode})")
+
+ return output_video, output_data, "\n".join(status_lines)
+
+ except Exception as e:
+ import traceback
+ status_lines.append(f"\n❌ 处理错误: {str(e)}")
+ status_lines.append(traceback.format_exc())
+ return None, None, "\n".join(status_lines)
+
+# ========== Gradio 界面 ==========
+with gr.Blocks(
+ title="Phantom - 机器人视频生成器",
+ theme=gr.themes.Soft()
+) as demo:
+
+ gr.Markdown("""
+ # 🤖 Phantom - 将人类视频转换为机器人演示
+
+ **论文**: [Phantom: Training Robots Without Robots Using Only Human Videos](https://phantom-human-videos.github.io/)
+
+ 将人类手部操作视频自动转换为机器人演示数据,用于训练机器人策略。
+ """)
+
+ with gr.Tabs():
+        # ========== Environment Setup Tab ==========
+        with gr.TabItem("1️⃣ Environment Setup"):
+            gr.Markdown("""
+            ### First-time use requires the following steps:
+
+            1. **Initialize the environment** - install dependencies and download models (about 5-10 minutes on first run)
+            2. **Upload the MANO models** - must be downloaded from the official website after registering
+            """)
+
+ with gr.Row():
+ with gr.Column():
+ init_btn = gr.Button("🔧 初始化环境", variant="primary", size="lg")
+ init_output = gr.Textbox(
+ label="初始化状态",
+ lines=15,
+ value=get_status_text()
+ )
+
+ with gr.Column():
+ gr.Markdown("""
+ ### MANO 模型下载
+
+ 1. 访问 [MANO 官网](https://mano.is.tue.mpg.de/)
+ 2. 注册账号并下载模型
+ 3. 上传 `MANO_LEFT.pkl` 和 `MANO_RIGHT.pkl`
+ """)
+
+ mano_left = gr.File(label="MANO_LEFT.pkl", file_types=[".pkl"])
+ mano_right = gr.File(label="MANO_RIGHT.pkl", file_types=[".pkl"])
+ upload_btn = gr.Button("📤 上传 MANO 模型")
+ upload_output = gr.Textbox(label="上传状态", lines=5)
+
+ init_btn.click(fn=initialize_environment, outputs=init_output)
+ upload_btn.click(fn=upload_mano_files, inputs=[mano_left, mano_right], outputs=upload_output)
+
+        # ========== Video Processing Tab ==========
+        with gr.TabItem("2️⃣ Video Processing"):
+ with gr.Row():
+ with gr.Column():
+ gr.Markdown("### 输入设置")
+
+ use_sample = gr.Checkbox(
+ label="使用示例数据 (pick_and_place)",
+ value=True,
+ info="推荐首次使用时勾选,使用预置的示例视频"
+ )
+
+ video_input = gr.Video(
+ label="或上传自己的视频",
+ interactive=True
+ )
+
+ robot_type = gr.Dropdown(
+ choices=["Panda", "Kinova3", "UR5e", "IIWA", "Jaco"],
+ value="Panda",
+ label="机器人类型"
+ )
+
+ target_hand = gr.Radio(
+ choices=["left", "right"],
+ value="left",
+ label="目标手部"
+ )
+
+ processing_mode = gr.Dropdown(
+ choices=[
+ "bbox",
+ "hand2d",
+ "arm_segmentation",
+ "hand_inpaint",
+ "robot_inpaint",
+ "all"
+ ],
+ value="bbox",
+ label="处理模式",
+ info="建议逐步运行: bbox -> hand2d -> arm_segmentation -> hand_inpaint -> robot_inpaint"
+ )
+
+ process_btn = gr.Button("🚀 开始处理", variant="primary", size="lg")
+
+ with gr.Column():
+ gr.Markdown("### 输出结果")
+
+ video_output = gr.Video(label="生成的机器人视频")
+ data_output = gr.File(label="训练数据 (NPZ)")
+ status_output = gr.Textbox(label="处理状态", lines=20)
+
+ process_btn.click(
+ fn=process_video,
+ inputs=[video_input, robot_type, target_hand, processing_mode, use_sample],
+ outputs=[video_output, data_output, status_output]
)
- btn = gr.Button("开始处理", variant="primary")
- with gr.Column():
- video_out = gr.Video(label="结果视频")
- data_out = gr.File(label="训练数据")
- status_out = gr.Textbox(label="状态", lines=10)
+        # ========== Documentation Tab ==========
+        with gr.TabItem("📖 Documentation"):
+            gr.Markdown("""
+            ## Processing pipeline
+
+            Phantom converts a human hand video into robot demonstration data in the following steps:
+
+            | Step | Mode | Description |
+            |------|------|------|
+            | 1 | `bbox` | Detect hand bounding boxes |
+            | 2 | `hand2d` | Extract 2D hand poses |
+            | 3 | `arm_segmentation` | Segment the human arm |
+            | 4 | `hand_inpaint` | Remove the arm and inpaint the background |
+            | 5 | `robot_inpaint` | Overlay a virtual robot |
+
+            ## Input requirements
+
+            - **Video format**: common formats such as MKV and MP4
+            - **Resolution**: 1080p recommended
+            - **Content**: single-hand manipulation video with the hand clearly visible
+
+            ## ZeroGPU limits
+
+            - Time limit per run: 300 seconds
+            - Run the processing modes one step at a time
+            - Complex videos may require several runs
+
+            ## References
- btn.click(
- fn=process_video,
- inputs=[video_input, robot_type, target_hand],
- outputs=[video_out, data_out, status_out]
- )
+            - [Phantom paper](https://arxiv.org/abs/2503.00779)
+            - [GitHub repository](https://github.com/MarionLepert/phantom)
+            - [MANO hand model](https://mano.is.tue.mpg.de/)
+ """)
+# Launch
if __name__ == "__main__":
demo.queue().launch()
diff --git a/phantom b/phantom
deleted file mode 160000
index a8bb81c1bbe6ade129a1f6f0906482f510354a5e..0000000000000000000000000000000000000000
--- a/phantom
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a8bb81c1bbe6ade129a1f6f0906482f510354a5e
diff --git a/phantom/.gitignore b/phantom/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..9d4c4b47f97936f015d1e9223c36e186b205383d
--- /dev/null
+++ b/phantom/.gitignore
@@ -0,0 +1,11 @@
+*.egg-info
+**/_DATA/*
+data/raw/*
+!data/raw/.gitkeep
+data/processed/*
+!data/processed/.gitkeep
+**/__pycache__/*
+*.pyc
+*.pth
+outputs/*
+phantom/outputs/*
\ No newline at end of file
diff --git a/phantom/.gitmodules b/phantom/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..f965f10b5541f50eba9f54f32884677fd641b8ea
--- /dev/null
+++ b/phantom/.gitmodules
@@ -0,0 +1,15 @@
+[submodule "submodules/phantom-E2FGVI"]
+ path = submodules/phantom-E2FGVI
+ url = git@github.com:MarionLepert/phantom-E2FGVI.git
+[submodule "submodules/sam2"]
+ path = submodules/sam2
+ url = git@github.com:facebookresearch/sam2.git
+[submodule "submodules/phantom-robosuite"]
+ path = submodules/phantom-robosuite
+ url = git@github.com:MarionLepert/phantom-robosuite.git
+[submodule "submodules/phantom-robomimic"]
+ path = submodules/phantom-robomimic
+ url = git@github.com:MarionLepert/phantom-robomimic.git
+[submodule "submodules/phantom-hamer"]
+ path = submodules/phantom-hamer
+ url = git@github.com:MarionLepert/phantom-hamer.git
diff --git a/phantom/LICENSE b/phantom/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a7919af83c0ef65fa3d553a06db8f0f491fd7cba
--- /dev/null
+++ b/phantom/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Stanford Interactive Perception and Robot Learning Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/phantom/README.md b/phantom/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cad6c69ba6b73a8ce9c87ec429c0ba4d25f6150e
--- /dev/null
+++ b/phantom/README.md
@@ -0,0 +1,168 @@
+# Code for Phantom and Masquerade
+[](https://www.python.org)
+[](https://opensource.org/licenses/MIT)
+
+
+This repository contains the code used to process human videos in [Phantom: Training Robots Without Robots Using Only Human Videos](https://phantom-human-videos.github.io/) and [Masquerade: Learning from In-the-wild Human Videos using Data-Editing](https://masquerade-robot.github.io/).
+
+
+
+Both projects use data editing to convert human videos into “robotized” demonstrations. They share much of the same codebase, with some differences in the processing pipeline:
+
+**Phantom**
+* Input: RGBD videos with a single left hand visible in every frame.
+* Data editing: inpaint the single human arm, overlay a rendered robot arm in the same pose.
+* Action labels: extract the full 3D end-effector pose (position, orientation, gripper).
+
+**Masquerade**
+* Input: RGB videos from [Epic Kitchens](https://epic-kitchens.github.io/2025); one or both hands may be visible, sometimes occluded.
+* Data editing: segment and inpaint both arms, overlay a bimanual robot whose end-effectors follow the estimated poses (with a 3-4 cm error along the depth direction due to the lack of depth data).
+* Action labels: use 2D projected waypoints as auxiliary supervision only (not full 3D actions).
+
+
+
+## Installation
+1. Clone this repo recursively
+
+```bash
+git clone --recursive git@github.com:MarionLepert/phantom.git
+```
+
+2. Run the following script from the root directory to install the required conda environment.
+```bash
+./install.sh
+```
+
+3. Download the MANO hand models. To do so, go to the [MANO website](https://mano.is.tue.mpg.de/) and register to be able to download the models. Download the left and right hand models and move `MANO_LEFT.pkl` and `MANO_RIGHT.pkl` into the `$ROOT_DIR/submodules/phantom-hamer/_DATA/data/mano/` folder; the sketch below checks that they are in place.
+
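+A quick way to confirm the models ended up in the expected location (a minimal sketch using only the paths named above):
+
+```python
+from pathlib import Path
+
+mano_dir = Path("submodules/phantom-hamer/_DATA/data/mano")  # relative to the repo root
+for name in ("MANO_LEFT.pkl", "MANO_RIGHT.pkl"):
+    print(name, "found" if (mano_dir / name).is_file() else "MISSING")
+```
+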
+## Getting Started
+Process **Phantom** sample data (manually collected in-lab videos)
+```bash
+conda activate phantom
+
+python process_data.py demo_name=pick_and_place data_root_dir=../data/raw processed_data_root_dir=../data/processed mode=all
+```
+
+Process **Masquerade** sample data ([Epic Kitchens](https://epic-kitchens.github.io/2025) video)
+```bash
+conda activate phantom
+
+python process_data.py demo_name=epic data_root_dir=../data/raw processed_data_root_dir=../data/processed mode=all --config-name=epic
+```
+
+
+## Codebase Overview
+
+### Process data
+Each video is processed using the following steps:
+
+1. **Extract human hand bounding boxes**: `bbox_processor.py`
+ * `mode=bbox`
+
+2. **Extract 2d human hand poses**: `hand_processor.py`
+ * `mode=hand2d`: extract the 2d hand pose
+
+3. **Extract hand and arm segmentation masks**: `segmentation_processor.py`
+ * `mode=hand_segmentation`: used for depth alignment in hand pose refinement (only works for hand3d)
+ * `mode=arm_segmentation`: needed in all cases to inpaint the human
+
+4. **Extract 3d human hand poses**: `hand_processor.py`
+ * `mode=hand3d`: extract the 3d hand pose (note: requires depth, and was only tested on the left hand)
+
+5. **Retarget human actions to robot actions**: `action_processor.py`
+ * `mode=action`
+
+6. **Smooth human poses**: `smoothing_processor.py`
+ * `mode=smoothing`
+
+7. **Remove hand from videos using inpainting**: `handinpaint_processor.py`
+ * `mode=hand_inpaint`
+ * Inpainting method [E2FGVI](https://arxiv.org/pdf/2204.02663) is used.
+
+8. **Overlay virtual robot on video**: `robotinpaint_processor.py`
+ * `mode=robot_inpaint`: overlay a single robot (default) or bimanual (epic mode) robot on the image
+
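+The modes above can also be run selectively. A minimal sketch of driving a subset of them programmatically, mirroring how the commands in Getting Started pass Hydra overrides to `process_data.py` (run from the `phantom/` package directory with the `phantom` environment active):
+
+```python
+import subprocess
+import sys
+
+# Run only the first two stages on the sample demo; multiple modes are passed
+# as a list, e.g. mode=[bbox,hand2d] (see the config reference below).
+cmd = [
+    sys.executable, "process_data.py",
+    "demo_name=pick_and_place",
+    "data_root_dir=../data/raw",
+    "processed_data_root_dir=../data/processed",
+    "mode=[bbox,hand2d]",
+    "bimanual_setup=single_arm",
+    "target_hand=left",
+]
+subprocess.run(cmd, check=True)
+```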
+
+### Config reference (see configuration files in `configs/`)
+
+| Flag | Type | Required | Choices | Description |
+|------|------|----------|---------|-------------|
+| `--demo_name` | `str` | ✅ | - | Name of the demonstration/dataset to process |
+| `--mode` | `str` (multiple) | ✅ | `bbox`, `hand2d`, `hand3d`, `hand_segmentation`, `arm_segmentation`, `action`, `smoothing`, `hand_inpaint`, `robot_inpaint`, `all` | Processing modes to run (can specify multiple with e.g. `'mode=[bbox,hand2d]'`) |
+| `--robot_name` | `str` | ✅ | `Panda`, `Kinova3`, `UR5e`, `IIWA`, `Jaco` | Type of robot to use for overlays |
+| `--gripper_name` | `str` | ❌ | `Robotiq85` | Type of gripper to use |
+| `--data_root_dir` | `str` | ❌ | - | Root directory containing raw video data |
+| `--processed_data_root_dir` | `str` | ❌ | - | Root directory to save processed data |
+| `--epic` | `bool` | ❌ | - | Use Epic-Kitchens dataset processing mode |
+| `--bimanual_setup` | `str` | ❌ | `single_arm`, `shoulders` | Bimanual setup configuration to use (shoulders corresponds to the bimanual hardware configuration used in Masquerade) |
+| `--target_hand` | `str` | ❌ | `left`, `right`, `both` | Which hand(s) to target for processing |
+| `--camera_intrinsics` | `str` | ❌ | - | Path to camera intrinsics file |
+| `--camera_extrinsics` | `str` | ❌ | - | Path to camera extrinsics file |
+| `--input_resolution` | `int` | ❌ | - | Resolution of input videos |
+| `--output_resolution` | `int` | ❌ | - | Resolution of output videos |
+| `--depth_for_overlay` | `bool` | ❌ | - | Use depth information for overlays |
+| `--demo_num` | `str` | ❌ | - | Process a single demo number instead of all demos |
+| `--debug_cameras` | `str` (multiple) | ❌ | - | Additional camera names to include for debugging |
+| `--constrained_hand` | `bool` | ❌ | - | Use constrained hand processing |
+| `--render` | `bool` | ❌ | - | Render the robot overlay on the video |
+
+**Note**: Please specify `--bimanual_setup single_arm` along with `--target_hand left` or `--target_hand right` if you are using a single arm. For bimanual setups, use `--bimanual_setup shoulders`.
+
+### Camera details
+* **Phantom**: a Zed2 camera was used to capture the sample data at HD1080 resolution.
+* **Masquerade**: Epic-Kitchens videos were used together with the camera intrinsics provided in the dataset. To use videos captured with a different camera resolution, update the camera intrinsics and extrinsics files in `$ROOT_DIR/phantom/camera/`.
+
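+For reference, the intrinsics files are plain JSON with per-camera `fx`, `fy`, `cx`, `cy` (plus distortion and FOV) entries; a minimal sketch of turning one into a 3x3 camera matrix:
+
+```python
+import json
+import numpy as np
+
+with open("phantom/camera/camera_intrinsics_HD1080.json") as f:
+    intr = json.load(f)["left"]  # the file stores "left" and "right" entries
+
+K = np.array([
+    [intr["fx"], 0.0, intr["cx"]],
+    [0.0, intr["fy"], intr["cy"]],
+    [0.0, 0.0, 1.0],
+])
+print(K)
+```
+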
+### Train policy
+After processing the video data, the edited data can be used to train a policy. The following files should be used:
+
+* Observations
+ * Phantom Samples: extract RGB images from `data/processed/pick_and_place/*/video_overlay_Panda_single_arm.mkv`
+ * Epic (In-the-wild Data) Samples: extract RGB images from `data/processed/epic/*/video_overlay_Kinova3_shoulders.mkv`
+
+* Actions
+ * Phantom Samples: All data stored in `data/processed/pick_and_place/*/inpaint_processor/training_data_single_arm.npz`
+ * Epic (In-the-wild Data) Samples: All data stored in `data/processed/epic/*/inpaint_processor/training_data_shoulders.npz`
+
+
+In Phantom, [Diffusion Policy](https://github.com/real-stanford/diffusion_policy) was used for policy training.
+
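+A minimal loading sketch for the files above (demo index `0` is used as an example; the array names stored in the `.npz` are inspected rather than assumed, and `mediapy` is installed by `install.sh`):
+
+```python
+import numpy as np
+import mediapy as media
+
+# Observations: RGB frames from the robot-overlay video
+frames = media.read_video("data/processed/pick_and_place/0/video_overlay_Panda_single_arm.mkv")
+
+# Actions: arrays written by the inpaint processor
+data = np.load("data/processed/pick_and_place/0/inpaint_processor/training_data_single_arm.npz")
+print("frames:", frames.shape)
+print("stored arrays:", data.files)
+```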
+
+## Citation
+```bibtex
+@article{lepert2025phantomtrainingrobotsrobots,
+ title={Phantom: Training Robots Without Robots Using Only Human Videos},
+ author={Marion Lepert and Jiaying Fang and Jeannette Bohg},
+ year={2025},
+ eprint={2503.00779},
+ archivePrefix={arXiv},
+ primaryClass={cs.RO},
+ url={https://arxiv.org/abs/2503.00779},
+ }
+```
+
+```bibtex
+@misc{lepert2025masqueradelearninginthewildhuman,
+ title={Masquerade: Learning from In-the-wild Human Videos using Data-Editing},
+ author={Marion Lepert and Jiaying Fang and Jeannette Bohg},
+ year={2025},
+ eprint={2508.09976},
+ archivePrefix={arXiv},
+ primaryClass={cs.RO},
+ url={https://arxiv.org/abs/2508.09976},
+}
+```
diff --git a/phantom/configs/default.yaml b/phantom/configs/default.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7de4caa5a2357a1b8466b9d079b14c4fd093e1e6
--- /dev/null
+++ b/phantom/configs/default.yaml
@@ -0,0 +1,30 @@
+# Default configuration (PHANTOM paper settings)
+debug: false
+verbose: false
+skip_existing: false
+n_processes: 1
+data_root_dir: "../data/raw_data/"
+processed_data_root_dir: "../data/processed_data/"
+demo_name: ""
+
+# Processing settings
+mode: ["bbox"] # Default processing mode - must be one of: bbox, hand2d, hand3d, hand_segmentation, arm_segmentation, action, smoothing, hand_inpaint, robot_inpaint, all
+demo_num: null # Process specific demo number (null = process all)
+
+# Additional settings
+debug_cameras: []
+
+# PHANTOM paper configuration (default)
+input_resolution: 1080
+output_resolution: 240
+robot: "Panda"
+gripper: "Robotiq85"
+square: true
+epic: false
+bimanual_setup: "single_arm"
+target_hand: "left"
+constrained_hand: true
+depth_for_overlay: true
+render: false
+camera_intrinsics: "camera/camera_intrinsics_HD1080.json"
+camera_extrinsics: "camera/camera_extrinsics.json"
diff --git a/phantom/configs/epic.yaml b/phantom/configs/epic.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cbcd86f3122a51cdd79f52027fdabe579755abc1
--- /dev/null
+++ b/phantom/configs/epic.yaml
@@ -0,0 +1,31 @@
+# EPIC-KITCHENS configuration (overrides the PHANTOM paper defaults)
+debug: false
+verbose: false
+skip_existing: false
+n_processes: 1
+data_root_dir: "../data/raw_data/"
+processed_data_root_dir: "../data/processed_data/"
+demo_name: ""
+
+# Processing settings
+mode: ["bbox"] # Default processing mode
+demo_num: null # Process specific demo number (null = process all videos in the root folder)
+
+# Additional settings
+debug_cameras: [] # Add other robomimic cameras like sideview, etc. Warning: this significantly slows down the processing time
+
+
+# EPIC-KITCHENS configuration override
+input_resolution: 256
+output_resolution: 256
+robot: "Kinova3"
+gripper: "Robotiq85"
+square: false
+epic: true
+bimanual_setup: "shoulders"
+target_hand: "both"
+constrained_hand: false
+depth_for_overlay: false
+render: false
+camera_intrinsics: "camera/camera_intrinsics_epic.json"
+camera_extrinsics: "camera/camera_extrinsics_ego_bimanual_shoulders.json"
diff --git a/phantom/configs/sam2_hiera_l.yaml b/phantom/configs/sam2_hiera_l.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1092802b1d24be6fedf78939f45b0d021d4ec560
--- /dev/null
+++ b/phantom/configs/sam2_hiera_l.yaml
@@ -0,0 +1,117 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 144
+ num_heads: 2
+ stages: [2, 6, 36, 4]
+ global_att_blocks: [23, 33, 43]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ window_spec: [8, 4, 16, 8]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [1152, 576, 288, 144]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/phantom/data/__init__.py b/phantom/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/docs/teaser_masquerade.png b/phantom/docs/teaser_masquerade.png
new file mode 100644
index 0000000000000000000000000000000000000000..821d8082dbe9ffbb0a2a2a21e3584fa204932b7e
--- /dev/null
+++ b/phantom/docs/teaser_masquerade.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f0f5355b51b44f98b8aced3b5c41255d3e9a04b0810a4d9b616c67e1ba05b9c
+size 1278978
diff --git a/phantom/docs/teaser_phantom.png b/phantom/docs/teaser_phantom.png
new file mode 100644
index 0000000000000000000000000000000000000000..fdef26e45b26c5a990313f7aa2c73374c7edba34
--- /dev/null
+++ b/phantom/docs/teaser_phantom.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a79506ef23efac9c85af0805ca5e23ec59a6a90e0de7bc475cfde94bd793f9c0
+size 3089124
diff --git a/phantom/install.sh b/phantom/install.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f98f34492a025c1bd46fe9a46d6bce65ca2ed12f
--- /dev/null
+++ b/phantom/install.sh
@@ -0,0 +1,67 @@
+eval "$(conda shell.bash hook)"
+# ######################## Phantom Env ###############################
+conda create -n phantom python=3.10 -y
+conda activate phantom
+conda install nvidia/label/cuda-12.1.0::cuda-toolkit -c nvidia/label/cuda-12.1.0 -y
+
+# Install SAM2
+cd submodules/sam2
+pip install -v -e ".[notebooks]"
+cd ../..
+
+# Install Hamer
+cd submodules/phantom-hamer
+pip install -e .\[all\]
+pip install -v -e third-party/ViTPose
+wget https://www.cs.utexas.edu/~pavlakos/hamer/data/hamer_demo_data.tar.gz
+tar --warning=no-unknown-keyword --exclude=".*" -xvf hamer_demo_data.tar.gz
+cd ../..
+
+# Install mmcv
+pip install --index-url https://download.pytorch.org/whl/cu121 torch==2.1.0 torchvision==0.16.0
+pip install mmcv==1.3.9
+pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html
+pip install numpy==1.26.4
+
+# Install phantom-robosuite
+cd submodules/phantom-robosuite
+pip install -e .
+cd ../..
+
+# Install phantom-robomimic
+cd submodules/phantom-robomimic
+pip install -e .
+cd ../..
+
+# Install additional packages
+pip install joblib mediapy open3d pandas
+pip install transformers==4.42.4
+pip install PyOpenGL==3.1.4
+pip install Rtree
+pip install git+https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes.git
+pip install protobuf==3.20.0
+pip install hydra-core==1.3.2
+pip install omegaconf==2.3.0
+
+# Download E2FGVI weights
+cd submodules/phantom-E2FGVI/E2FGVI/release_model/
+pip install gdown
+gdown --fuzzy https://drive.google.com/file/d/10wGdKSUOie0XmCr8SQ2A2FeDe-mfn5w3/view?usp=sharing
+cd ../..
+
+# Install phantom-E2FGVI
+pip install -e .
+cd ../..
+
+# Install phantom
+pip install -e .
+
+# Download sample data
+cd data/raw
+wget https://download.cs.stanford.edu/juno/phantom/pick_and_place.zip
+unzip pick_and_place.zip
+rm pick_and_place.zip
+wget https://download.cs.stanford.edu/juno/phantom/epic.zip
+unzip epic.zip
+rm epic.zip
+cd ../..
diff --git a/phantom/phantom/__init__.py b/phantom/phantom/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/phantom/camera/__init__.py b/phantom/phantom/camera/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/phantom/camera/camera_extrinsics.json b/phantom/phantom/camera/camera_extrinsics.json
new file mode 100644
index 0000000000000000000000000000000000000000..002325929b83dabaae9357cf13f684994f5a200a
--- /dev/null
+++ b/phantom/phantom/camera/camera_extrinsics.json
@@ -0,0 +1,42 @@
+[
+ {
+ "camera_base_ori": [
+ [
+ 0.9842690634302423,
+ -0.053375086066005106,
+ 0.1684206369825258
+ ],
+ [
+ -0.1763762231197722,
+ -0.35235905397979306,
+ 0.9190944048336218
+ ],
+ [
+ 0.010287793357058851,
+ -0.934341584895969,
+ -0.3562302121408726
+ ]
+ ],
+ "camera_base_ori_rotvec": [
+ -1.930138005212092,
+ 0.16467696378244215,
+ -0.12809137765065973
+ ],
+ "camera_base_pos": [
+ 0.3407932803063093,
+ -0.40868423448040403,
+ 0.39911982578151795
+ ],
+ "camera_base_quat": [
+ 0.8204965462375373,
+ -0.07000374049084156,
+ 0.054451304871138306,
+ -0.564729979129313
+ ],
+ "p_marker_ee": [
+ -0.01874144739551215,
+ 0.029611448317719172,
+ -0.013687685723932594
+ ]
+ }
+]
\ No newline at end of file
diff --git a/phantom/phantom/camera/camera_extrinsics_ego_bimanual_shoulders.json b/phantom/phantom/camera/camera_extrinsics_ego_bimanual_shoulders.json
new file mode 100644
index 0000000000000000000000000000000000000000..88a9a82435a8d7b9b3f3be32671ff1c1fa8f1573
--- /dev/null
+++ b/phantom/phantom/camera/camera_extrinsics_ego_bimanual_shoulders.json
@@ -0,0 +1,52 @@
+[
+ {
+ "num_marker_seen": 114,
+ "stage2_retry": 11,
+ "pixel_error": 2.1157278874907863,
+ "proj_func": "hand_marker_proj_world_camera",
+ "intrinsics": {
+ "fx": 731.4708862304688,
+ "fy": 731.4708862304688,
+ "ppx": 646.266357421875,
+ "ppy": 355.9967956542969
+ },
+ "camera_base_ori": [
+ [
+ -0.7220417114840215,
+ 0.37764981440725887,
+ 0.579686453658689
+ ],
+ [
+ 0.020370475586732495,
+ 0.8491206965938227,
+ -0.527805917303316
+ ],
+ [
+ -0.6915495720493177,
+ -0.3692893991088662,
+ -0.6207934673498243
+ ]
+ ],
+ "camera_base_ori_rotvec": [
+ 0.2877344548443808,
+ 2.3075097094104504,
+ -0.6485227972051454
+ ],
+ "camera_base_pos": [
+ -0.5123627783256401,
+ -0.11387480700266536,
+ 0.3151264229148423
+ ],
+ "p_marker_ee": [
+ -0.041990731174163416,
+ -0.02636865486252487,
+ -0.01442948433864288
+ ],
+ "camera_base_quat": [
+ 0.11139014686225811,
+ 0.8933022830245745,
+ -0.25106152012025673,
+ 0.35576871621882866
+ ]
+ }
+]
\ No newline at end of file
diff --git a/phantom/phantom/camera/camera_intrinsics_HD1080.json b/phantom/phantom/camera/camera_intrinsics_HD1080.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff52d7fe21d3d1b5bc1b078e3ac4e5ba0292f327
--- /dev/null
+++ b/phantom/phantom/camera/camera_intrinsics_HD1080.json
@@ -0,0 +1,48 @@
+{
+ "left": {
+ "fx": 1057.7322998046875,
+ "fy": 1057.7322998046875,
+ "cx": 972.5150756835938,
+ "cy": 552.568359375,
+ "disto": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "v_fov": 54.09259796142578,
+ "h_fov": 84.45639038085938,
+ "d_fov": 92.32276916503906
+ },
+ "right": {
+ "fx": 1057.7322998046875,
+ "fy": 1057.7322998046875,
+ "cx": 972.5150756835938,
+ "cy": 552.568359375,
+ "disto": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "v_fov": 54.09259796142578,
+ "h_fov": 84.45639038085938,
+ "d_fov": 92.32276916503906
+ }
+}
\ No newline at end of file
diff --git a/phantom/phantom/camera/camera_intrinsics_epic.json b/phantom/phantom/camera/camera_intrinsics_epic.json
new file mode 100644
index 0000000000000000000000000000000000000000..29986434a940212bca3f49df336bfeb3520a839a
--- /dev/null
+++ b/phantom/phantom/camera/camera_intrinsics_epic.json
@@ -0,0 +1,48 @@
+{
+ "left": {
+ "fx": 248.7892127911359,
+ "fy": 248.7892127911359,
+ "cx": 228,
+ "cy": 128,
+ "disto": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "v_fov": 54.6,
+ "h_fov": 83.21271514892578,
+ "d_fov": 91.07240295410156
+ },
+ "right": {
+ "fx": 248.7892127911359,
+ "fy": 248.7892127911359,
+ "cx": 228,
+ "cy": 128,
+ "disto": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "v_fov": 54.6,
+ "h_fov": 83.21271514892578,
+ "d_fov": 91.07240295410156
+ }
+}
\ No newline at end of file
diff --git a/phantom/phantom/detectors/detector_detectron2.py b/phantom/phantom/detectors/detector_detectron2.py
new file mode 100644
index 0000000000000000000000000000000000000000..608dd61d900a476a51ddc0285afd503ffa753047
--- /dev/null
+++ b/phantom/phantom/detectors/detector_detectron2.py
@@ -0,0 +1,121 @@
+"""
+Wrapper around detectron2 for object detection
+"""
+import os
+import numpy as np
+from pathlib import Path
+from typing import Tuple
+import cv2
+import logging
+import mediapy as media
+import requests
+import hamer # type: ignore
+from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy # type: ignore
+from detectron2.config import LazyConfig # type: ignore
+
+logger = logging.getLogger(__name__)
+
+def download_detectron_ckpt(root_dir: str, ckpt_path: str) -> None:
+ url = "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h/f328730692/model_final_f05665.pkl"
+ save_path = Path(root_dir, ckpt_path)
+ save_path.parent.mkdir(exist_ok=True, parents=True)
+ response = requests.get(url, stream=True)
+ if response.status_code == 200:
+ with open(save_path, "wb") as file:
+ for chunk in response.iter_content(chunk_size=8192):
+ file.write(chunk)
+ logger.info(f"File downloaded successfully and saved to {save_path}")
+ else:
+ logger.info(f"Failed to download the file. Status code: {response.status_code}")
+
+
+class DetectorDetectron2:
+ def __init__(self, root_dir: str):
+ cfg_path = (Path(hamer.__file__).parent / "configs" / "cascade_mask_rcnn_vitdet_h_75ep.py")
+ detectron2_cfg = LazyConfig.load(str(cfg_path))
+
+ detectron2_cfg.train.init_checkpoint = os.path.join(
+ root_dir, "_DATA/detectron_ckpts/model_final_f05665.pkl"
+ )
+ if not os.path.exists(detectron2_cfg.train.init_checkpoint):
+ download_detectron_ckpt(
+ root_dir, "_DATA/detectron_ckpts/model_final_f05665.pkl"
+ )
+ for predictor in detectron2_cfg.model.roi_heads.box_predictors:
+ predictor.test_score_thresh = 0.25
+ self.detectron2 = DefaultPredictor_Lazy(detectron2_cfg)
+
+ def get_bboxes(self, img: np.ndarray, visualize: bool=False,
+ visualize_wait: bool=True) -> Tuple[np.ndarray, np.ndarray]:
+ """ Get bounding boxes and scores for the detected hand in the image """
+ det_out = self.detectron2(img)
+
+ det_instances = det_out["instances"]
+ valid_idx = (det_instances.pred_classes == 0) & (det_instances.scores > 0.5)
+ pred_bboxes = det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
+ pred_scores = det_instances.scores[valid_idx].cpu().numpy()
+
+ if visualize:
+ img_rgb = img.copy()
+ img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
+ for bbox, score in zip(pred_bboxes, pred_scores):
+ cv2.rectangle(
+ img_bgr,
+ (int(bbox[0]), int(bbox[1])),
+ (int(bbox[2]), int(bbox[3])),
+ (0, 255, 0),
+ 2,
+ )
+ cv2.putText(img_bgr,
+ f"{score:.4f}",
+ (int(bbox[0]), int(bbox[1])),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 1,
+ (0, 255, 0),
+ 2,
+ cv2.LINE_AA)
+
+ cv2.imshow(f"Detected bounding boxes", img_bgr)
+ if visualize_wait:
+ cv2.waitKey(0)
+ else:
+ cv2.waitKey(1)
+
+ return pred_bboxes, pred_scores
+
+ def get_best_bbox(self, img: np.ndarray, visualize: bool=False,
+ visualize_wait: bool=True) -> Tuple[np.ndarray, float]:
+ """ Get the best bounding box and score for the detected hand in the image """
+ bboxes, scores = self.get_bboxes(img)
+ if len(bboxes) == 0:
+ logger.info("No bbox found with Detectron")
+ return np.array([]), 0
+ best_idx = scores.argmax()
+ best_bbox, best_score = bboxes[best_idx], scores[best_idx]
+
+ if visualize:
+ img_rgb = img.copy()
+ img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
+ cv2.rectangle(
+ img_bgr,
+ (int(best_bbox[0]), int(best_bbox[1])),
+ (int(best_bbox[2]), int(best_bbox[3])),
+ (0, 255, 0),
+ 2,
+ )
+ cv2.putText(img_bgr,
+ f"{best_score:.4f}",
+ (int(best_bbox[0]), int(best_bbox[1])),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 1,
+ (0, 255, 0),
+ 2,
+ cv2.LINE_AA)
+
+ cv2.imshow(f"Best detected bounding box", img_bgr)
+ if visualize_wait:
+ cv2.waitKey(0)
+ else:
+ cv2.waitKey(1)
+
+ return best_bbox, best_score
\ No newline at end of file
diff --git a/phantom/phantom/detectors/detector_dino.py b/phantom/phantom/detectors/detector_dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ad8fa9162d5545ebd9f9e6bacf434e56a24e37a
--- /dev/null
+++ b/phantom/phantom/detectors/detector_dino.py
@@ -0,0 +1,108 @@
+"""
+Wrapper around DINO-V2 for object detection
+"""
+from typing import Sequence, Tuple, Optional
+import numpy as np
+from transformers import pipeline # type: ignore
+from PIL import Image
+import cv2
+import logging
+
+from phantom.utils.image_utils import DetectionResult
+
+logger = logging.getLogger(__name__)
+
+class DetectorDino:
+ def __init__(self, detector_id: str):
+ self.detector = pipeline(
+ model=detector_id,
+ task="zero-shot-object-detection",
+ device="cuda",
+ batch_size=4,
+ )
+
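+    # Rough usage sketch (illustrative only; the detector_id and text prompt are
+    # whatever the caller passes in):
+    #   detector = DetectorDino(detector_id="...")
+    #   bboxes, scores = detector.get_bboxes(frame_rgb, object_name="hand", threshold=0.4)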
+ def get_bboxes(self, frame: np.ndarray, object_name: str, threshold: float = 0.4,
+ visualize: bool = False, pause_visualization: bool = True) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Detect objects in a frame and return their bounding boxes and confidence scores.
+
+ Args:
+ frame: Input image as numpy array in RGB format
+ object_name: Target object category to detect
+ threshold: Confidence threshold for detection (0.0-1.0)
+ visualize: If True, displays detection results visually
+ pause_visualization: If True, waits for key press when visualizing
+
+ Returns:
+ Tuple of (bounding_boxes, confidence_scores) as numpy arrays
+ Empty arrays if no objects detected
+ """
+ img_pil = Image.fromarray(frame)
+ labels = [f"{object_name}."]
+ results = self.detector(img_pil, candidate_labels=labels, threshold=threshold)
+ results = [DetectionResult.from_dict(result) for result in results]
+ if not results:
+ return np.array([]), np.array([])
+ bboxes = np.array([np.array(result.box.xyxy) for result in results])
+ scores = np.array([result.score for result in results])
+
+ if visualize:
+ img_rgb = frame.copy()
+ img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
+ for bbox, score in zip(bboxes, scores):
+ cv2.rectangle(
+ img_bgr,
+ (int(bbox[0]), int(bbox[1])),
+ (int(bbox[2]), int(bbox[3])),
+ (0, 255, 0),
+ 2,
+ )
+ cv2.putText(img_bgr,
+ f"{score:.4f}",
+ (int(bbox[0]), int(bbox[1])),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 1,
+ (0, 255, 0),
+ 2,
+ cv2.LINE_AA)
+ cv2.imshow("Detection", img_bgr)
+ if pause_visualization:
+ cv2.waitKey(0)
+ else:
+ cv2.waitKey(1)
+ return bboxes, scores
+
+
+ def get_best_bbox(self, frame: np.ndarray, object_name: str, threshold: float = 0.4,
+ visualize: bool = False, pause_visualization: bool = True) -> Optional[np.ndarray]:
+ bboxes, scores = self.get_bboxes(frame, object_name, threshold)
+ if len(bboxes) == 0:
+ return None
+ best_idx = np.array(scores).argmax()
+ best_bbox, best_score = bboxes[best_idx], scores[best_idx]
+
+ if visualize:
+ img_rgb = frame.copy()
+ img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
+ cv2.rectangle(
+ img_bgr,
+ (best_bbox[0], best_bbox[1]),
+ (best_bbox[2], best_bbox[3]),
+ (0, 255, 0),
+ 2,
+ )
+ cv2.putText(img_bgr,
+ f"{best_score:.4f}",
+ (int(best_bbox[0]), int(best_bbox[1])),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 1,
+ (0, 255, 0),
+ 2,
+ cv2.LINE_AA)
+ cv2.imshow("Detection", img_bgr)
+ if pause_visualization:
+ cv2.waitKey(0)
+ else:
+ cv2.waitKey(1)
+ return best_bbox
+
\ No newline at end of file
diff --git a/phantom/phantom/detectors/detector_hamer.py b/phantom/phantom/detectors/detector_hamer.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc6f5143255df1a1b9f6086854266d86a53278a9
--- /dev/null
+++ b/phantom/phantom/detectors/detector_hamer.py
@@ -0,0 +1,447 @@
+"""
+Wrapper around HaMeR for hand pose estimation
+"""
+import os
+import logging
+import numpy as np
+from pathlib import Path
+from typing import Optional, Tuple
+
+import cv2
+import torch
+from hamer.utils import recursive_to # type: ignore
+import matplotlib.pyplot as plt
+
+from hamer.models import HAMER, DEFAULT_CHECKPOINT # type: ignore
+import sys
+# Add the phantom-hamer directory to Python path for vitpose_model import
+hamer_path = os.path.join(os.path.dirname(__file__), '..', '..', 'submodules', 'phantom-hamer')
+if hamer_path not in sys.path:
+ sys.path.insert(0, hamer_path)
+from vitpose_model import ViTPoseModel # type: ignore
+from hamer.datasets.vitdet_dataset import ViTDetDataset # type: ignore
+from hamer.utils.renderer import cam_crop_to_full # type: ignore
+from hamer.utils.geometry import perspective_projection # type: ignore
+from hamer.configs import get_config # type: ignore
+from yacs.config import CfgNode as CN # type: ignore
+
+from phantom.utils.data_utils import get_parent_folder_of_package
+
+logger = logging.getLogger(__name__)
+
+THUMB_VERTEX = 756
+INDEX_FINGER_VERTEX = 350
+
+class DetectorHamer:
+ """
+ Detector using the HaMeR model for 3D hand pose estimation.
+
+ The detection pipeline consists of:
+ - Initial hand detection using general object detectors
+ - Hand type classification (left/right) using ViTPose
+ - 3D pose estimation using HaMeR
+ - MANO parameters estimation for mesh reconstruction
+
+ Dependencies:
+ - HaMeR model for 3D pose estimation
+ - ViTPose for keypoint detection
+ - DINO and Detectron2 for initial hand detection
+ """
+ def __init__(self):
+ root_dir = get_parent_folder_of_package("hamer")
+ checkpoint_path = Path(root_dir, DEFAULT_CHECKPOINT)
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ self.rescale_factor = 2.0 # Factor for padding the box
+ self.batch_size = 1 # Batch size for inference
+
+ self.model, self.model_cfg = self.load_hamer_model(checkpoint_path, root_dir)
+ self.model.to(self.device)
+ self.model.eval()
+
+ root_dir = "../submodules/phantom-hamer/"
+ vit_dir = os.path.join(root_dir, "third-party/ViTPose/")
+ self.cpm = ViTPoseModel(device=self.device, root_dir=root_dir, vit_dir=vit_dir)
+
+ self.faces_right = self.model.mano.faces
+ self.faces_left = self.faces_right[:,[0,2,1]]
+
+ def detect_hand_keypoints(self,
+ img: np.ndarray,
+ hand_side: str,
+ visualize: bool=False,
+ visualize_3d: bool=False,
+ pause_visualization: bool=True,
+ bboxes: Optional[np.ndarray]=None,
+ is_right: Optional[np.ndarray]=None,
+ kpts_2d_only: Optional[bool]=False,
+ camera_params: Optional[dict]=None) -> Optional[dict]:
+ """
+ Detect hand keypoints in the input image.
+
+ The method performs the following steps:
+ 1. Detect hand bounding boxes using object detectors
+ 2. Optionally refine boxes using ViTPose to determine hand type (left/right)
+ 3. Run HaMeR model to estimate 3D hand pose
+ 4. Project 3D keypoints back to 2D for visualization
+
+ Args:
+ img: Input RGB image as numpy array
+ hand_side: Target hand side to detect (left or right)
+ visualize: If True, displays detection results in a window
+ visualize_3d: If True, shows 3D visualization of keypoints and mesh
+ pause_visualization: If True, waits for key press when visualizing
+ bboxes: Bounding boxes of the hands
+ is_right: Whether the hand is right
+ kpts_2d_only: If True, only cares about 2D keypoints, i.e., use default
+ focal length in HaMeR instead of real camera intrinsics
+ camera_params: Optional camera intrinsics (fx, fy, cx, cy)
+
+ Returns:
+ Dictionary containing:
+ - annotated_img: Image with keypoints drawn
+ - success: Whether detection was successful (21 keypoints found)
+ - kpts_3d: 3D keypoints in camera space
+ - kpts_2d: 2D keypoints projected onto image
+ - verts: 3D mesh vertices
+ - T_cam_pred: Camera transformation matrix
+ - Various camera parameters and MANO pose parameters
+ """
+ if not kpts_2d_only:
+ scaled_focal_length, camera_center = self.get_image_params(img, camera_params)
+ else:
+ scaled_focal_length, camera_center = self.get_image_params(img, camera_params=None)
+
+
+ dataset = ViTDetDataset(self.model_cfg, img, bboxes, is_right, rescale_factor=self.rescale_factor)
+ dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=False, num_workers=0)
+
+ list_2d_kpts, list_3d_kpts, list_verts = [], [], []
+ T_cam_pred_all: list[torch.Tensor] = []
+ list_global_orient = []
+ kpts_2d_hamer = None
+ for batch in dataloader:
+ batch = recursive_to(batch, "cuda")
+ with torch.no_grad():
+ out = self.model(batch)
+
+ batch_T_cam_pred_all = DetectorHamer.get_all_T_cam_pred(batch, out, scaled_focal_length)
+
+ for idx in range(len(batch_T_cam_pred_all)):
+ kpts_3d = out["pred_keypoints_3d"][idx].detach().cpu().numpy() # [21, 3]
+ verts = out["pred_vertices"][idx].detach().cpu().numpy() # [778, 3]
+ is_right = batch["right"][idx].cpu().numpy()
+ global_orient = out["pred_mano_params"]["global_orient"][idx].detach().cpu().numpy()
+ hand_pose = out["pred_mano_params"]["hand_pose"][idx].detach().cpu().numpy()
+ list_global_orient.append(global_orient)
+
+ if hand_side == "left":
+ kpts_3d, verts = DetectorHamer.convert_right_hand_keypoints_to_left_hand(kpts_3d, verts)
+
+ T_cam_pred = batch_T_cam_pred_all[idx]
+
+ img_w, img_h = batch["img_size"][idx].float()
+
+ kpts_2d_hamer = DetectorHamer.project_3d_kpt_to_2d(kpts_3d, img_w, img_h, scaled_focal_length,
+ camera_center, T_cam_pred)
+
+ # Keep T_cam_pred as tensor
+ list_2d_kpts.append(kpts_2d_hamer)
+ list_3d_kpts.append(kpts_3d + T_cam_pred.cpu().numpy())
+ list_verts.append(verts + T_cam_pred.cpu().numpy())
+
+ T_cam_pred_all += batch_T_cam_pred_all
+
+ annotated_img = DetectorHamer.visualize_2d_kpt_on_img(
+ kpts_2d=list_2d_kpts[0],
+ img=img,
+ )
+
+ if visualize:
+ if bboxes is not None:
+ cv2.rectangle(annotated_img, (int(bboxes[0][0]), int(bboxes[0][1])), (int(bboxes[0][2]), int(bboxes[0][3])), (0, 255, 0), 2)
+ cv2.imshow("Annotated Image", annotated_img)
+ cv2.waitKey(0 if pause_visualization else 1)
+
+ if visualize_3d:
+ DetectorHamer.visualize_keypoints_3d(annotated_img, list_3d_kpts[0], list_verts[0])
+
+
+ return {
+ "annotated_img": annotated_img,
+ "success": len(list_2d_kpts[0]) == 21,
+ "kpts_3d": list_3d_kpts[0],
+ "kpts_2d": np.rint(list_2d_kpts[0]).astype(np.int32),
+ "verts": list_verts[0],
+ "T_cam_pred": T_cam_pred_all[0],
+ "scaled_focal_length": scaled_focal_length,
+ "camera_center": camera_center,
+ "img_w": img_w,
+ "img_h": img_h,
+ "global_orient": list_global_orient[0],
+ "hand_pose": hand_pose,
+ }
+
+ def get_image_params(self, img: np.ndarray, camera_params: Optional[dict]) -> Tuple[float, torch.Tensor]:
+ """
+ Get the scaled focal length and camera center.
+ """
+ img_w = img.shape[1]
+ img_h = img.shape[0]
+ if camera_params is not None:
+ scaled_focal_length = camera_params["fx"]
+ cx = camera_params["cx"]
+ cy = camera_params["cy"]
+ camera_center = torch.tensor([img_w-cx, img_h-cy])
+ else:
+ scaled_focal_length = (self.model_cfg.EXTRA.FOCAL_LENGTH / self.model_cfg.MODEL.IMAGE_SIZE
+ * max(img_w, img_h))
+ camera_center = torch.tensor([img_w, img_h], dtype=torch.float).reshape(1, 2) / 2.0
+ return scaled_focal_length, camera_center
+
+ @staticmethod
+ def convert_right_hand_keypoints_to_left_hand(kpts, verts):
+ """
+ Convert right hand keypoints/vertices to left hand by mirroring across the Y-Z plane.
+
+ This is done by flipping the X coordinates of both keypoints and vertices.
+ The MANO model internally uses right hand, so this conversion is needed
+ when processing left hands.
+
+ Args:
+ kpts: 3D keypoints [21, 3]
+ verts: 3D mesh vertices [778, 3]
+
+ Returns:
+ Transformed keypoints and vertices
+ """
+ kpts[:,0] = -kpts[:,0]
+ verts[:,0] = -verts[:,0]
+ return kpts, verts
+
+ @staticmethod
+ def visualize_keypoints_3d(annotated_img: np.ndarray, kpts_3d: np.ndarray, verts: np.ndarray) -> None:
+ nfingers = len(kpts_3d) - 1
+ npts_per_finger = 4
+ list_fingers = [np.vstack([kpts_3d[0], kpts_3d[i:i + npts_per_finger]]) for i in range(1, nfingers, npts_per_finger)]
+ finger_colors_bgr = [(0, 255, 0), (0, 0, 255), (255, 0, 0), (255, 0, 255), (0, 255, 255)]
+ finger_colors_rgb = [(color[2], color[1], color[0]) for color in finger_colors_bgr]
+ fig, axs = plt.subplots(1,2, figsize=(20, 10))
+ axs[0] = fig.add_subplot(111, projection='3d')
+ for finger_idx, finger_pts in enumerate(list_fingers):
+ for i in range(len(finger_pts) - 1):
+ color = finger_colors_rgb[finger_idx]
+ axs[0].plot(
+ [finger_pts[i][0], finger_pts[i + 1][0]],
+ [finger_pts[i][1], finger_pts[i + 1][1]],
+ [finger_pts[i][2], finger_pts[i + 1][2]],
+ color=np.array(color)/255.0,
+ )
+ axs[0].scatter(kpts_3d[:, 0], kpts_3d[:, 1], kpts_3d[:, 2])
+ axs[0].scatter(verts[:, 0], verts[:, 1], verts[:, 2])
+ annotated_img_rgb = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
+ axs[1].imshow(annotated_img_rgb)
+
+ fig = plt.figure()
+ ax = fig.add_subplot(111)
+ ax.imshow(annotated_img_rgb)
+
+ plt.show()
+
+ @staticmethod
+ def get_all_T_cam_pred(batch: dict, out: dict, scaled_focal_length: float) -> torch.Tensor:
+ """
+ Get the camera transformation matrix
+ """
+ multiplier = 2 * batch["right"] - 1
+ pred_cam = out["pred_cam"]
+ pred_cam[:, 1] = multiplier * pred_cam[:, 1]
+ box_center = batch["box_center"].float()
+ box_size = batch["box_size"].float()
+ # NOTE: FOR HaMeR, they are using the img_size as (W, H)
+ W_H_shapes = batch["img_size"].float()
+
+ multiplier = 2 * batch["right"] - 1
+ T_cam_pred_all = cam_crop_to_full(
+ pred_cam, box_center, box_size, W_H_shapes, scaled_focal_length
+ )
+
+ return T_cam_pred_all
+
+ @staticmethod
+ def visualize_2d_kpt_on_img(kpts_2d: np.ndarray, img: np.ndarray) -> np.ndarray:
+ """
+ Plot 2D hand keypoints on the image with finger connections.
+
+ Each finger is drawn with a different color:
+ - Thumb: Green
+ - Index: Blue
+ - Middle: Red
+ - Ring: Magenta
+ - Pinky: Cyan
+
+ Args:
+ kpts_2d: 2D keypoints as integers [21, 2]
+ img: Input RGB image
+
+ Returns:
+ Image with keypoints and connections drawn (BGR format)
+ """
+ img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+ pts = kpts_2d.astype(np.int32)
+ nfingers = len(pts) - 1
+ npts_per_finger = 4
+ list_fingers = [np.vstack([pts[0], pts[i:i + npts_per_finger]]) for i in range(1, nfingers, npts_per_finger)]
+ finger_colors = [(0, 255, 0), (0, 0, 255), (255, 0, 0), (255, 0, 255), (0, 255, 255)]
+ thickness = 5 if img_bgr.shape[0] > 1000 else 2
+ for finger_idx, finger_pts in enumerate(list_fingers):
+ for i in range(len(finger_pts) - 1):
+ color = finger_colors[finger_idx]
+ cv2.line(
+ img_bgr,
+ tuple(finger_pts[i]),
+ tuple(finger_pts[i + 1]),
+ color,
+ thickness=thickness,
+ )
+
+
+ for pt in pts:
+ cv2.circle(img_bgr, (pt[0], pt[1]), radius=thickness, color=(0,0,0), thickness=thickness-1)
+
+ return img_bgr
+
+
+ @staticmethod
+ def project_3d_kpt_to_2d(kpts_3d: torch.Tensor, img_w: int, img_h: int, scaled_focal_length: float,
+ camera_center: torch.Tensor, T_cam: Optional[torch.Tensor] = None,) -> np.ndarray:
+ """
+ Project 3D keypoints to 2D image coordinates using perspective projection.
+ """
+ batch_size = 1
+
+ rotation = torch.eye(3).unsqueeze(0)
+ assert T_cam is not None
+
+ T_cam = T_cam.cpu()
+        kpts_3d = torch.as_tensor(kpts_3d).cpu()
+
+ T_cam = T_cam.clone().cuda()
+ kpts_3d = kpts_3d.clone().cuda()
+ rotation = rotation.cuda()
+
+ scaled_focal_length_full = torch.tensor([scaled_focal_length, scaled_focal_length]).reshape(1, 2)
+
+ # IMPORTANT: The perspective_projection function assumes T_cam has not been added to kpts_3d already!
+ kpts_2d = perspective_projection(
+ kpts_3d.reshape(batch_size, -1, 3),
+ rotation=rotation.repeat(batch_size, 1, 1),
+ translation=T_cam.reshape(batch_size, -1),
+ focal_length=scaled_focal_length_full.repeat(batch_size, 1),
+ camera_center=camera_center.repeat(batch_size, 1),
+ ).reshape(batch_size, -1, 2)
+ kpts_2d = kpts_2d[0].cpu().numpy()
+
+ return np.rint(kpts_2d).astype(np.int32)
+
+ @staticmethod
+ def annotate_bboxes_on_img(img: np.ndarray, debug_bboxes: dict) -> np.ndarray:
+ """
+ Annotate bounding boxes on the image.
+
+ :param img: Input image (numpy array)
+ :param debug_bboxes: Dictionary containing different sets of bounding boxes and optional scores
+ :return: Annotated image
+ """
+ color_dict = {
+ "dino_bboxes": (0, 255, 0),
+ "det_bboxes": (0, 0, 255),
+ "refined_bboxes": (255, 0, 0),
+ "filtered_bboxes": (255, 255, 0),
+ }
+ corner_dict = {
+ "dino_bboxes": "top_left",
+ "det_bboxes": "top_right",
+ "refined_bboxes": "bottom_left",
+ "filtered_bboxes": "bottom_right",
+ }
+
+ def draw_bbox_and_label(bbox, label, color, label_pos, include_label=True):
+ """ Helper function to draw the bounding box and add label """
+ cv2.rectangle(
+ img,
+ (int(bbox[0]), int(bbox[1])),
+ (int(bbox[2]), int(bbox[3])),
+ color,
+ 2,
+ )
+ if include_label:
+ cv2.putText(
+ img, label, label_pos,
+ cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2, cv2.LINE_AA
+ )
+
+ label_pos_dict = {
+ "top_left": lambda bbox: (int(bbox[0]), int(bbox[1]) - 10),
+ "bottom_right": lambda bbox: (int(bbox[2]) - 150, int(bbox[3]) - 10),
+ "top_right": lambda bbox: (int(bbox[2]) - 150, int(bbox[1]) - 10),
+ "bottom_left": lambda bbox: (int(bbox[0]), int(bbox[3]) - 10),
+ }
+
+ for key, value in debug_bboxes.items():
+ # Unpack bboxes and scores
+ if key in ["dino_bboxes", "det_bboxes"]:
+ bboxes, scores = value
+ else:
+ bboxes = value
+ scores = [None] * len(bboxes)
+
+ color = color_dict.get(key, (0, 0, 0))
+ label_pos_fn = label_pos_dict[corner_dict.get(key, "top_left")]
+
+ # Draw each bounding box and its label
+ for idx, bbox in enumerate(bboxes):
+ score_text = f" {scores[idx]:.3f}" if scores[idx] is not None else ""
+ label = key.split("_")[0] + score_text
+
+ # Draw bounding box and label on the image
+ label_pos = label_pos_fn(bbox)
+ if key in ["dino_bboxes", "det_bboxes"] or idx == 0:
+ draw_bbox_and_label(bbox, label, color, label_pos)
+ return img
+
+
+ @staticmethod
+ def load_hamer_model(checkpoint_path: str, root_dir: Optional[str] = None) -> Tuple[HAMER, CN]:
+ """
+ Load the HaMeR model from the checkpoint path.
+ """
+ model_cfg_path = str(Path(checkpoint_path).parent.parent / "model_config.yaml")
+ model_cfg = get_config(model_cfg_path, update_cachedir=True)
+ # update model and params path
+ if root_dir:
+ model_cfg.defrost()
+ model_cfg.MANO.DATA_DIR = os.path.join(root_dir, model_cfg.MANO.DATA_DIR)
+ model_cfg.MANO.MODEL_PATH = os.path.join(root_dir, model_cfg.MANO.MODEL_PATH.replace("./", ""))
+ model_cfg.MANO.MEAN_PARAMS = os.path.join(root_dir, model_cfg.MANO.MEAN_PARAMS.replace("./", ""))
+ model_cfg.freeze()
+
+ # Override some config values, to crop bbox correctly
+ if (model_cfg.MODEL.BACKBONE.TYPE == "vit") and ("BBOX_SHAPE" not in model_cfg.MODEL):
+ model_cfg.defrost()
+ assert (
+ model_cfg.MODEL.IMAGE_SIZE == 256
+ ), f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
+ model_cfg.MODEL.BBOX_SHAPE = [192, 256]
+ model_cfg.freeze()
+
+ # Update config to be compatible with demo
+ if "PRETRAINED_WEIGHTS" in model_cfg.MODEL.BACKBONE:
+ model_cfg.defrost()
+ model_cfg.MODEL.BACKBONE.pop("PRETRAINED_WEIGHTS")
+ model_cfg.freeze()
+
+ model = HAMER.load_from_checkpoint(checkpoint_path, strict=False, cfg=model_cfg)
+ return model, model_cfg
diff --git a/phantom/phantom/detectors/detector_sam2.py b/phantom/phantom/detectors/detector_sam2.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe180fe7652d4bf3739c5a5f9aa90054da356945
--- /dev/null
+++ b/phantom/phantom/detectors/detector_sam2.py
@@ -0,0 +1,240 @@
+"""
+Wrapper around SAM2 for object segmentation
+"""
+import numpy as np
+import pdb
+import os
+import logging
+import requests
+from typing import Tuple, Optional
+from pathlib import Path
+import matplotlib.pyplot as plt
+from matplotlib.axes import Axes
+import cv2
+from PIL import Image
+import torch
+from sam2.build_sam import build_sam2 # type: ignore
+from sam2.sam2_image_predictor import SAM2ImagePredictor # type: ignore
+from sam2.build_sam import build_sam2_video_predictor # type: ignore
+
+logger = logging.getLogger(__name__)
+
+def download_sam2_ckpt(ckpt_path: str) -> None:
+ url = "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt"
+ save_path = Path(ckpt_path)
+ save_path.parent.mkdir(exist_ok=True, parents=True)
+ response = requests.get(url, stream=True)
+ if response.status_code == 200:
+ with open(save_path, "wb") as file:
+ for chunk in response.iter_content(chunk_size=8192):
+ file.write(chunk)
+ logger.info(f"File downloaded successfully and saved to {save_path}")
+ else:
+        logger.error(f"Failed to download the file. Status code: {response.status_code}")
+
+class DetectorSam2:
+ """
+ A detector that uses the SAM2 model for object segmentation in images and videos.
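+
+    Example (minimal usage sketch; paths and prompts are hypothetical):
+        detector = DetectorSam2()
+        segments, annotated = detector.segment_video(
+            video_dir=Path("data/frames"),         # directory of per-frame images
+            bbox=np.array([100, 100, 300, 300]),   # xyxy box around the target object
+            points=[[[200, 200]]],                 # one list of positive clicks per prompt frame
+            indices=[0],                           # frame index for each prompt
+        )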
+ """
+ def __init__(self):
+ checkpoint = "../submodules/sam2/checkpoints/sam2_hiera_large.pt"
+ model_cfg = "sam2_hiera_l.yaml"
+
+ if not os.path.exists(checkpoint):
+ download_sam2_ckpt(checkpoint)
+ self.device = "cuda"
+
+ self.video_predictor = build_sam2_video_predictor(model_cfg, checkpoint, device=self.device)
+
+    def segment_video(self, video_dir: Path, bbox: np.ndarray, points: np.ndarray,
+                      indices: np.ndarray, reverse: bool=False, output_bboxes: Optional[np.ndarray]=None):
+        """
+        Segment an object across video frames using SAM2's video tracking capabilities.
+
+        Parameters:
+            video_dir: Directory containing video frames as image files
+            bbox: Bounding box coordinates [x0, y0, x1, y1] for the object to track
+            points: Point prompt(s) on the object, one set per prompt frame
+            indices: Frame indices at which the prompts are applied
+            reverse: If True, propagate the masks backward in time
+            output_bboxes: Optional per-frame boxes used to crop the resulting masks
+
+        Returns:
+            video_segments: Dictionary mapping frame indices to segmentation masks
+            list_annotated_imgs: Dictionary mapping frame indices to frames with the segmented object masked out
+        """
+ frame_names = os.listdir(video_dir)
+ frame_names = sorted(frame_names)
+ with torch.inference_mode(), torch.autocast(self.device, dtype=torch.bfloat16):
+ state = self.video_predictor.init_state(video_path=str(video_dir))
+ self.video_predictor.reset_state(state)
+
+ for point, idx in zip(points, indices):
+ try:
+                    if bbox is None or np.all(bbox == 0):
+ self.video_predictor.add_new_points_or_box(
+ state,
+ frame_idx=int(idx),
+ obj_id=0,
+ points=np.array(point),
+ labels=np.ones(len(point)),
+ )
+ else:
+ self.video_predictor.add_new_points_or_box(
+ state,
+ frame_idx=int(idx),
+ obj_id=0,
+ box=np.array(bbox),
+ points=np.array(point),
+ labels=np.ones(len(point)),
+ )
+ except Exception as e:
+                    logger.error(f"Error in adding new points or box: {e}")
+ pdb.set_trace()
+
+ video_segments = {}
+ for (
+ out_frame_idx,
+ out_obj_ids,
+ out_mask_logits,
+ ) in self.video_predictor.propagate_in_video(state, reverse=reverse):
+ video_segments[out_frame_idx] = {
+ out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+ for i, out_obj_id in enumerate(out_obj_ids)
+ }
+
+ frame_indices = list(video_segments.keys())
+ frame_indices.sort()
+ list_annotated_imgs = {}
+ for out_frame_idx in frame_indices:
+ img = Image.open(os.path.join(video_dir, frame_names[out_frame_idx]))
+ img_arr = np.array(img)
+ mask = video_segments[out_frame_idx][0]
+ if output_bboxes is not None:
+ # Crop the mask to the bounding box
+ output_bbox = output_bboxes[out_frame_idx].astype(np.int32)
+ if output_bbox.sum() > 0:
+                    bbox_mask = self._crop_mask_to_bbox(mask, output_bbox)
+ mask = mask * bbox_mask
+ img_arr[mask[0]] = (0, 0, 0)
+ list_annotated_imgs[out_frame_idx] = img_arr
+
+ if output_bboxes is not None:
+ for out_frame_idx in frame_indices:
+ output_bbox = output_bboxes[out_frame_idx].astype(np.int32)
+ mask = video_segments[out_frame_idx][0]
+                if output_bbox.sum() > 0:
+                    bbox_mask = self._crop_mask_to_bbox(mask, output_bbox)
+ mask = mask * bbox_mask
+ video_segments[out_frame_idx] = {
+ 0: mask
+ }
+
+ # Fix gpu memory leak
+ torch.cuda.empty_cache()
+
+ return video_segments, list_annotated_imgs
+
+ def _crop_mask_to_bbox(self, mask: np.ndarray, bbox: np.ndarray) -> np.ndarray:
+ """
+ Crop a mask to a bounding box.
+ """
+ margin = 20
+ bbox = bbox.astype(np.int32)
+ x0, y0, x1, y1 = bbox
+ x0 = max(0, x0 - margin)
+ x1 = min(mask.shape[2], x1 + margin)
+ y0 = max(0, y0 - margin)
+ y1 = min(mask.shape[1], y1 + margin)
+ bbox_mask = np.zeros_like(mask)
+ bbox_mask[:, y0:y1, x0:x1] = 1
+ return bbox_mask
+
+ def segment_video_from_mask(self, video_dir: str, mask: np.ndarray, frame_idx: int, reverse=False):
+ """
+ Propagate a segmentation mask through video frames (forward or backward).
+
+ Parameters:
+ video_dir: Directory containing video frames
+ mask: Initial segmentation mask to propagate
+ frame_idx: Frame index where the mask is defined
+ reverse: If True, propagate backward in time; if False, propagate forward
+
+ Returns:
+ frame_indices: List of frame indices where masks were generated
+ video_segments: Dictionary mapping frame indices to segmentation masks
+ """
+ with torch.inference_mode(), torch.autocast(self.device, dtype=torch.bfloat16):
+ state = self.video_predictor.init_state(video_path=video_dir)
+ self.video_predictor.reset_state(state)
+
+ self.video_predictor.add_new_mask(state, frame_idx, 0, mask)
+
+ video_segments = {}
+ mask_prob = {}
+ for (
+ out_frame_idx,
+ out_obj_ids,
+ out_mask_logits,
+ ) in self.video_predictor.propagate_in_video(state, reverse=reverse):
+ mask_prob[out_frame_idx] = torch.mean(torch.sigmoid(out_mask_logits))
+ video_segments[out_frame_idx] = {
+ out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+ for i, out_obj_id in enumerate(out_obj_ids)
+ }
+
+ frame_indices = list(video_segments.keys())
+ frame_indices.sort()
+ return frame_indices, video_segments
+
+ @staticmethod
+ def show_mask(mask: np.ndarray, ax: Axes, random_color: bool=False, borders: bool = True) -> None:
+ if random_color:
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+ else:
+ color = np.array([30/255, 144/255, 255/255, 0.6])
+ h, w = mask.shape[-2:]
+ mask = mask.astype(np.uint8)
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+ if borders:
+ contours, _ = cv2.findContours(mask,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+ # Try to smooth contours
+ contours = [cv2.approxPolyDP(contour, epsilon=0.01, closed=True) for contour in contours]
+ mask_image = cv2.drawContours(mask_image, contours, -1, (1, 1, 1, 0.5), thickness=2)
+ ax.imshow(mask_image)
+
+
+ @staticmethod
+ def show_masks(image: np.ndarray, masks: np.ndarray, scores: np.ndarray, point_coords: Optional[np.ndarray]=None,
+ box_coords: Optional[np.ndarray]=None, input_labels: Optional[np.ndarray]=None, borders: bool=True) -> None:
+ n_masks = len(masks)
+        fig, axs = plt.subplots(1, n_masks, figsize=(10*n_masks, 10))
+        axs = np.atleast_1d(axs)  # plt.subplots returns a bare Axes when n_masks == 1
+ for i, (mask, score) in enumerate(zip(masks, scores)):
+ axs[i].imshow(image)
+ DetectorSam2.show_mask(mask, axs[i], borders=borders)
+ if point_coords is not None:
+ assert input_labels is not None
+ DetectorSam2.show_points(point_coords, input_labels, axs[i])
+ if box_coords is not None:
+ DetectorSam2.show_box(box_coords, axs[i])
+ if len(scores) > 1:
+ axs[i].set_title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18)
+ axs[i].axis('off')
+ plt.show()
+
+ @staticmethod
+ def show_box(box: np.ndarray, ax: Axes) -> None:
+ x0, y0 = box[0], box[1]
+ w, h = box[2] - box[0], box[3] - box[1]
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))
+
+
+ @staticmethod
+ def show_points(coords: np.ndarray, labels: np.ndarray, ax: Axes, marker_size: int=375) -> None:
+ pos_points = coords[labels==1]
+ neg_points = coords[labels==0]
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*',
+ s=marker_size, edgecolor='white', linewidth=1.25)
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*',
+ s=marker_size, edgecolor='white', linewidth=1.25)
diff --git a/phantom/phantom/hand.py b/phantom/phantom/hand.py
new file mode 100644
index 0000000000000000000000000000000000000000..a13fe0ecbff77249d2251315106d6160c0de5d20
--- /dev/null
+++ b/phantom/phantom/hand.py
@@ -0,0 +1,805 @@
+"""
+Hand Model Module
+
+This module provides hand modeling for action processors. It converts detected hand
+keypoints into kinematic models that can be used for robot control.
+
+Key Components:
+- HandModel: Base class for unconstrained hand kinematic modeling
+- PhysicallyConstrainedHandModel: Extended class with anatomical joint limits and velocity constraints
+- Grasp point and orientation calculation for robot end-effector control
+
+The hand model follows the MediaPipe hand landmark convention with 21 keypoints:
+- Wrist (1 point)
+- Thumb (4 points: MCP, PIP, DIP, TIP)
+- Index finger (4 points: MCP, PIP, DIP, TIP)
+- Middle finger (4 points: MCP, PIP, DIP, TIP)
+- Ring finger (4 points: MCP, PIP, DIP, TIP)
+- Pinky finger (4 points: MCP, PIP, DIP, TIP)
+
+Coordinate System:
+- All calculations performed in robot coordinate frame
+- Grasp orientations aligned with robot end-effector conventions
+- Joint rotations represented as rotation matrices and Euler angles
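+
+Example (illustrative indexing sketch; variable names are placeholders):
+    import numpy as np
+    kpts = np.zeros((21, 3))                   # stand-in for one frame of detected keypoints
+    wrist = kpts[0]
+    thumb, index = kpts[1:5], kpts[5:9]        # each finger: MCP, PIP, DIP, TIP
+    middle, ring, pinky = kpts[9:13], kpts[13:17], kpts[17:21]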
+"""
+
+from typing import Optional, List, Dict, Tuple, Union, Any
+import numpy as np
+import pdb
+import torch
+from scipy.spatial.transform import Rotation
+import logging
+
+from phantom.utils.transform_utils import *
+logger = logging.getLogger(__name__)
+
+class HandModel:
+ """
+ Base class for hand kinematic modeling and trajectory analysis.
+
+ This class provides a kinematic representation of a human hand using 21 keypoints
+ from hand pose estimation. It calculates joint rotations, tracks hand motion over
+ time, and computes grasp points and orientations suitable for robot control.
+
+ The kinematic structure follows a tree topology with the wrist as the root,
+ and each finger as a separate chain. Joint rotations are calculated relative
+ to parent joints using vector alignment methods.
+
+ Key Features:
+ - 21-point hand keypoint processing
+ - Joint rotation calculation using vector alignment
+ - Grasp point computation from thumb-index / thumb-middle finger positioning
+ - End-effector orientation calculation for robot control
+
+ Attributes:
+ robot_name (str): Name of the target robot for coordinate frame alignment
+ kinematic_tree (List[Tuple[int, int]]): Parent-child relationships for hand joints
+ joint_to_neighbors_mapping (Dict[int, Tuple[int, int, int]]): Mapping of joints to their neighbors
+ vertex_positions (List[np.ndarray]): Time series of hand keypoint positions
+ joint_rotations (List[List[np.ndarray]]): Time series of joint rotation matrices
+ grasp_points (List[np.ndarray]): Time series of computed grasp points
+ grasp_oris (List[np.ndarray]): Time series of grasp orientation matrices
+ timestamps (List[float]): Time stamps for each frame
+ num_joints (int): Total number of joints in the hand model
+ joint_rotations_xyz (List[List[np.ndarray]]): Time series of Euler angle representations
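+
+    Example (minimal sketch; the robot name and keypoints are placeholders):
+        model = HandModel("panda")
+        kpts = np.random.rand(21, 3)               # stand-in for one frame of detected keypoints
+        model.add_frame(kpts, timestamp=0.0)
+        grasp_pt = model.grasp_points[-1]          # 3D grasp point for the latest frame
+        grasp_ori = model.grasp_oris[-1]           # 3x3 gripper orientation matrix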
+ """
+ def __init__(self, robot_name: str) -> None:
+ """
+ Initialize the hand model with kinematic structure.
+
+ Args:
+ robot_name: Name of the target robot for coordinate alignment
+ """
+ self.robot_name: str = robot_name
+
+ # Define the kinematic tree structure for hand joints
+ # Format: (joint_index, parent_index) where -1 indicates root (wrist)
+ self.kinematic_tree: List[Tuple[int, int]] = [
+ (0, -1), # wrist base (root of the kinematic tree)
+
+ # Thumb chain (4 joints)
+ (1, 0), # thumb mcp
+ (2, 1), # thumb pip
+ (3, 2), # thumb dip
+ (4, 3), # thumb tip
+
+ # Index finger chain (4 joints)
+ (5, 0), # index mcp
+ (6, 5), # index pip
+ (7, 6), # index dip
+ (8, 7), # index tip
+
+ # Middle finger chain (4 joints)
+ (9, 0), # middle mcp
+ (10, 9), # middle pip
+ (11, 10), # middle dip
+ (12, 11), # middle tip
+
+ # Ring finger chain (4 joints)
+ (13, 0), # ring mcp
+ (14, 13), # ring pip
+ (15, 14), # ring dip
+ (16, 15), # ring tip
+
+ # Pinky finger chain (4 joints)
+ (17, 0), # pinky mcp
+ (18, 17), # pinky pip
+ (19, 18), # pinky dip
+ (20, 19), # pinky tip
+ ]
+
+ # Mapping from joint index to (current_vertex, child_vertex, parent_vertex)
+ # This defines the local coordinate system for each joint rotation calculation
+ self.joint_to_neighbors_mapping: Dict[int, Tuple[int, int, int]] = {
+ # Thumb joint mappings
+ 0: (0, 1, -1), # wrist to thumb mcp (no parent)
+ 1: (1, 2, 0), # thumb mcp to pip (parent: wrist)
+ 2: (2, 3, 1), # thumb pip to dip (parent: thumb mcp)
+ 3: (3, 4, 2), # thumb dip to tip (parent: thumb pip)
+
+ # Index finger joint mappings
+ 4: (0, 5, -1), # wrist to index mcp (no parent)
+ 5: (5, 6, 0), # index mcp to pip (parent: wrist)
+ 6: (6, 7, 5), # index pip to dip (parent: index mcp)
+ 7: (7, 8, 6), # index dip to tip (parent: index pip)
+
+ # Middle finger joint mappings
+ 8: (0, 9, -1), # wrist to middle mcp (no parent)
+ 9: (9, 10, 0), # middle mcp to pip (parent: wrist)
+ 10: (10, 11, 9), # middle pip to dip (parent: middle mcp)
+ 11: (11, 12, 10),# middle dip to tip (parent: middle pip)
+
+ # Ring finger joint mappings
+ 12: (0, 13, -1), # wrist to ring mcp (no parent)
+ 13: (13, 14, 0),# ring mcp to pip (parent: wrist)
+ 14: (14, 15, 13),# ring pip to dip (parent: ring mcp)
+ 15: (15, 16, 14),# ring dip to tip (parent: ring pip)
+
+ # Pinky finger joint mappings
+ 16: (0, 17, -1), # wrist to pinky mcp (no parent)
+ 17: (17, 18, 0),# pinky mcp to pip (parent: wrist)
+ 18: (18, 19, 17),# pinky pip to dip (parent: pinky mcp)
+ 19: (19, 20, 18),# pinky dip to tip (parent: pinky pip)
+ }
+
+ self.num_joints: int = len(self.joint_to_neighbors_mapping)
+
+ # Time series data storage
+ self.vertex_positions: List[np.ndarray] = [] # List of (21, 3) arrays for each timestep
+ self.joint_rotations: List[List[np.ndarray]] = [] # List of rotation matrices for each joint
+ self.joint_rotations_xyz: List[List[np.ndarray]] = [] # List of Euler angle representations
+ self.grasp_points: List[np.ndarray] = [] # List of computed grasp points (3D positions)
+ self.grasp_oris: List[np.ndarray] = [] # List of grasp orientation matrices (3x3)
+ self.timestamps: List[float] = [] # List of timestamps for temporal analysis
+
+ def calculate_joint_rotation(self, current_pos: np.ndarray, child_pos: np.ndarray, parent_pos: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Calculate the rotation matrix for a single joint using vector alignment.
+
+ This method computes the rotation that aligns the previous direction vector
+ with the current direction vector. For root joints (no parent), it uses
+ a default upward direction as the reference.
+
+ Args:
+ current_pos: 3D position of the current joint
+ child_pos: 3D position of the child joint
+ parent_pos: 3D position of the parent joint
+
+ Returns:
+ Tuple containing:
+ - rotation_matrix: 3x3 rotation matrix
+ - euler_angles: Rotation as XYZ Euler angles
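+
+        Example (illustrative; a root joint whose bone points along +y):
+            model = HandModel("panda")                  # robot name is a placeholder
+            R, euler = model.calculate_joint_rotation(
+                current_pos=np.zeros(3),
+                child_pos=np.array([0.0, 1.0, 0.0]),
+                parent_pos=None,                        # root joint: reference direction is +z
+            )
+            # R is approximately a -90 degree rotation about the x-axis, mapping +z onto +y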
+ """
+ # Calculate current direction vector (current -> child)
+ current_dir = child_pos - current_pos
+ current_norm = np.linalg.norm(current_dir)
+ if current_norm < 1e-10:
+ return np.eye(3), np.array([0,0,0])
+ current_dir /= current_norm
+
+ # Calculate previous direction vector (parent -> current, or default up)
+ prev_dir = np.array([0.0, 0.0, 1.0]) if parent_pos is None else current_pos - parent_pos
+ prev_norm = np.linalg.norm(prev_dir)
+ if prev_norm < 1e-10:
+ return np.eye(3), np.array([0,0,0])
+ prev_dir /= prev_norm
+
+ # Check if vectors are already aligned (no rotation needed)
+ if np.abs((np.abs(np.dot(current_dir, prev_dir)) - 1)) < 1e-8:
+ return np.eye(3), np.array([0,0,0])
+
+ # Calculate rotation that aligns prev_dir with current_dir
+ rotation, _ = Rotation.align_vectors([current_dir], [prev_dir])
+ return rotation.as_matrix(), rotation.as_euler('xyz')
+
+ def calculate_frame_rotations(self, vertices: np.ndarray) -> Tuple[List[np.ndarray], List[np.ndarray]]:
+ """
+ Calculate rotation matrices for all joints in a single frame.
+
+ This method processes all joints in the hand and computes their rotations
+ based on the kinematic structure and current vertex positions.
+
+ Args:
+ vertices: Hand keypoints, shape (21, 3)
+
+ Returns:
+ Tuple containing:
+ - rotation_matrices: List of 3x3 rotation matrices
+ - euler_angles: List of XYZ Euler angle arrays
+ """
+ rotations, rotations_xyz = zip(*[
+ self.calculate_joint_rotation(vertices[m[0]], vertices[m[1]],
+ None if m[2] == -1 else vertices[m[2]])
+ for m in self.joint_to_neighbors_mapping.values()
+ ])
+ return list(rotations), list(rotations_xyz)
+
+ def calculate_angular_velocity(self, joint_idx: int, t1: int, t2: int) -> np.ndarray:
+ """
+ Calculate angular velocity for a specific joint between two time frames.
+
+ Angular velocity is computed as the rotation vector difference divided
+ by the time difference between frames.
+
+ Args:
+ joint_idx: Index of the joint
+ t1: Index of the first time frame
+ t2: Index of the second time frame
+
+ Returns:
+ Angular velocity vector (3,) in rad/s
+ """
+ dt = self.timestamps[t2] - self.timestamps[t1]
+ if dt == 0:
+ return np.zeros(3)
+
+ # Get rotation matrices for the two time frames
+ R1, R2 = self.joint_rotations[t1][joint_idx], self.joint_rotations[t2][joint_idx]
+
+ # Calculate relative rotation and convert to angular velocity
+ R_relative = Rotation.from_matrix(R2) * Rotation.from_matrix(R1).inv()
+ return R_relative.as_rotvec() / dt
+
+ def calculate_frame_angular_velocities(self, current_frame_idx: int) -> np.ndarray:
+ """
+ Calculate angular velocities for all joints at the current frame.
+
+ This method computes the angular velocity vectors for all joints by
+ comparing rotations with the previous frame. Returns zeros for the
+ first frame since no previous frame exists.
+
+ Args:
+ current_frame_idx: Index of the current frame. Must be > 0.
+
+ Returns:
+ Array of angular velocity vectors (shape: num_joints x 3)
+ Each row contains [wx, wy, wz] for one joint.
+ Returns zeros if current_frame_idx == 0.
+ """
+ if current_frame_idx == 0:
+ return np.zeros((self.num_joints, 3))
+
+ prev_frame_idx = current_frame_idx - 1
+
+ # Vectorized calculation for all joints
+ velocities = np.array([
+ self.calculate_angular_velocity(joint_idx, prev_frame_idx, current_frame_idx)
+ for joint_idx in range(self.num_joints)
+ ])
+
+ return velocities
+
+ def calculate_grasp_plane(self, vertices: np.ndarray) -> np.ndarray:
+ """
+ Calculate the plane that best fits through a set of hand vertices.
+
+ This method uses Singular Value Decomposition (SVD) to find the plane.
+ The plane is typically fitted through thumb and index finger points.
+
+ Args:
+ vertices: Set of 3D points to fit plane through, shape (N, 3)
+
+ Returns:
+ Plane coefficients [a, b, c, d] for ax + by + cz + d = 0
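+
+        Example (illustrative; four coplanar points on z = 0):
+            model = HandModel("panda")                  # robot name is a placeholder
+            pts = np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0],
+                            [0.0, 1.0, 0.0], [1.0, 1.0, 0.0]])
+            coeffs = model.calculate_grasp_plane(pts)
+            # coeffs is approximately [0, 0, +/-1, 0], i.e. the z = 0 plane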
+ """
+ # Create augmented matrix with homogeneous coordinates for plane fitting
+ A = np.c_[vertices[:, 0], vertices[:, 1], vertices[:, 2], np.ones(vertices.shape[0])]
+
+        # Solve the homogeneous system A @ [a, b, c, d] = 0 in the least-squares sense via SVD
+        U, S, Vt = np.linalg.svd(A)
+
+ # Plane coefficients are in the last row of Vt (smallest singular value)
+ plane_coeffs = Vt[-1, :]
+
+ # Normalize coefficients for easier interpretation (unit normal vector)
+ plane_coeffs = plane_coeffs / np.linalg.norm(plane_coeffs[:3])
+
+ return plane_coeffs # [a, b, c, d]
+
+ def calculate_grasp_point(self, grasp_plane: np.ndarray, vertices: np.ndarray) -> np.ndarray:
+ """
+ Calculate the optimal grasp point for robot end-effector positioning.
+
+ The grasp point is computed as the midpoint between projected thumb tip
+ and index finger tip on the grasp plane. This provides a stable reference
+ point for robot grasping operations.
+
+ Args:
+ grasp_plane: Plane coefficients [a, b, c, d]
+ vertices: Hand keypoints, shape (21, 3)
+
+ Returns:
+ 3D grasp point coordinates
+ """
+ # Project fingertips onto the grasp plane
+ thumb_pt = project_point_to_plane(vertices[4], grasp_plane)
+ index_pt = project_point_to_plane(vertices[8], grasp_plane)
+
+ # Compute midpoint as the grasp reference
+ hand_ee_pt = np.mean([thumb_pt, index_pt], axis=0)
+ return hand_ee_pt
+
+ def add_frame(self, vertices: np.ndarray, timestamp: float, hand_detected: bool = True) -> None:
+ """
+ Add a new frame of vertex positions and calculate corresponding data.
+
+ This is the main method for processing hand data over time. It computes
+ grasp points, orientations, and stores all relevant information for
+ the current timestep.
+
+ Args:
+ vertices: Array of 21 3D vertex positions
+ timestamp: Time of the frame in seconds
+ hand_detected: Whether hand was successfully detected
+ """
+ if len(vertices) != 21:
+ raise ValueError(f"Expected 21 vertices, got {len(vertices)}")
+
+ # Handle frames without hand detection
+ if not hand_detected:
+ self.vertex_positions.append(np.zeros((21, 3)))
+ self.grasp_points.append(np.zeros(3))
+ self.grasp_oris.append(np.eye(3))
+ self.timestamps.append(timestamp)
+ return
+
+ # Extract key finger tip positions
+ thumb_tip = vertices[4]
+ index_tip = vertices[8]
+ middle_tip = vertices[12]
+
+ # Calculate grasp point as midpoint between thumb and middle finger tips
+ control_point = (thumb_tip + middle_tip) / 2
+ grasp_pt = control_point
+
+ # Calculate gripper orientation from thumb-index finger configuration
+ gripper_ori, _ = HandModel.get_gripper_orientation(thumb_tip, index_tip, vertices)
+
+ # Apply 90-degree rotation to align with robot gripper convention
+ rot_90_deg = Rotation.from_euler('Z', 90, degrees=True).as_matrix()
+ grasp_ori = gripper_ori @ rot_90_deg
+
+ # Store all frame data
+ self.vertex_positions.append(vertices)
+ self.grasp_points.append(grasp_pt)
+ self.grasp_oris.append(grasp_ori)
+ self.timestamps.append(timestamp)
+
+
+ def get_joint_data(self, joint_idx: int) -> Dict[str, Union[List[float], List[np.ndarray]]]:
+ """
+ Get all trajectory data for a specific joint across all frames.
+
+ Args:
+ joint_idx: Index of the joint
+
+ Returns:
+ Dictionary containing joint trajectory data with keys:
+ - 'timestamps': List of time stamps
+ - 'rotations': List of rotation matrices for this joint
+ """
+ return {
+ 'timestamps': self.timestamps,
+ 'rotations': [frame[joint_idx] for frame in self.joint_rotations],
+ }
+
+ @staticmethod
+ def get_parallel_plane(a: float, b: float, c: float, d: float, dist: float) -> Tuple[float, float, float, float]:
+ """
+ Calculate coefficients of a plane parallel to the given plane at specified distance.
+
+ This utility method is useful for creating offset grasp planes that account
+ for gripper thickness or provide clearance during grasping operations.
+
+ Parameters:
+ a, b, c, d: Coefficients of the original plane ax + by + cz + d = 0
+ dist: Distance between planes (positive moves in normal direction)
+
+ Returns:
+ (a, b, c, d_new) coefficients of the parallel plane
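+
+        Example (illustrative):
+            # Shift the z = 0 plane by 1 cm along its normal
+            a, b, c, d_new = HandModel.get_parallel_plane(0.0, 0.0, 1.0, 0.0, dist=0.01)
+            # Result: (0.0, 0.0, 1.0, -0.01), i.e. the plane z = 0.01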
+ """
+ # Calculate the magnitude of the normal vector
+ normal_magnitude = np.sqrt(a**2 + b**2 + c**2)
+
+ # Parallel plane has same normal direction, only d changes
+ d_new = d - dist * normal_magnitude
+
+ return (a, b, c, d_new)
+
+ @staticmethod
+ def get_gripper_orientation(thumb_tip: np.ndarray, index_tip: np.ndarray, vertices: np.ndarray, grasp_plane: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Compute robot gripper orientation matrix from hand keypoints and fingertip positions.
+
+ This method calculates a coordinate frame suitable for robot gripper control
+ based on the relative positions of thumb, index finger, and wrist. The resulting
+ orientation matrix can be directly used for robot end-effector control.
+
+ Args:
+ thumb_tip: 3D position of thumb tip
+ index_tip: 3D position of index finger tip
+ vertices: All hand keypoints, shape (21, 3)
+ grasp_plane: Plane coefficients [a,b,c,d]
+
+ Returns:
+ Tuple containing:
+ - gripper_orientation: 3x3 rotation matrix
+ - z_axis: Z-axis direction vector of the gripper frame
+ """
+ # Calculate gripper opening direction (thumb to index finger)
+ gripper_direction = thumb_tip - index_tip
+
+ # Calculate gripper reference point (midpoint of fingertips)
+ midpoint = (thumb_tip + index_tip) / 2
+
+ if grasp_plane is None:
+ # Use palm geometry when no plane is provided
+ palm_axis = vertices[5] - midpoint # index MCP to midpoint
+ x_axis = gripper_direction / max(np.linalg.norm(gripper_direction), 1e-10)
+ z_axis = -palm_axis / max(np.linalg.norm(palm_axis), 1e-10)
+ else:
+ # Use grasp plane for orientation calculation
+ palm_axis = project_point_to_plane(vertices[0], grasp_plane) - project_point_to_plane(vertices[1], grasp_plane)
+ z_axis = -palm_axis / max(np.linalg.norm(palm_axis), 1e-10)
+ x_axis = np.cross(grasp_plane[:3], z_axis)
+ x_axis /= max(np.linalg.norm(x_axis), 1e-10)
+
+ # Compute y-axis
+ y_axis = np.cross(z_axis, x_axis)
+ y_axis /= max(np.linalg.norm(y_axis), 1e-10)
+
+ # Ensure orthogonality by recalculating z_axis
+ z_axis = np.cross(x_axis, y_axis)
+ z_axis /= max(np.linalg.norm(z_axis), 1e-10)
+
+ # Check orientation consistency with palm direction
+        if isinstance(palm_axis, torch.Tensor):
+ palm_axis = palm_axis.cpu().numpy()
+ if z_axis @ palm_axis > 0:
+ x_axis, y_axis, z_axis = -x_axis, -y_axis, -z_axis
+
+ # Construct orientation matrix
+ gripper_ori = np.column_stack([x_axis, y_axis, z_axis])
+
+ # Ensure proper handedness (right-handed coordinate system)
+ if np.linalg.det(gripper_ori) < 0:
+ x_axis = -x_axis # Flip one axis to fix handedness
+ gripper_ori = np.column_stack([x_axis, y_axis, z_axis])
+
+ # Verify determinant for debugging
+ det = np.linalg.det(gripper_ori)
+ if det < 0.9:
+ pdb.set_trace()
+
+ return gripper_ori, z_axis
+
+
+class PhysicallyConstrainedHandModel(HandModel):
+ """
+ Extended hand model with physical constraints and realistic joint limits.
+
+ This class builds upon the base HandModel by adding realistic constraints
+ that enforce physically plausible hand poses and motion. It includes:
+ - Joint angle limits based on human hand anatomy
+ - Angular velocity constraints for smooth motion
+ - Pose reconstruction with constraint enforcement
+ - Enhanced grasp point calculation with plane-based refinement
+
+    The physically constrained hand model is the variant used by Phantom.
+
+ Key Constraints:
+ - Anatomically correct joint limits for each finger joint
+ - Velocity limiting to prevent jerky motions
+ - Iterative pose refinement with constraint satisfaction
+ - More robust grasp plane calculation and orientation alignment
+
+ Attributes:
+ joint_limits (Dict[int, Tuple[float, ...]]): Joint angle limits for each joint in radians
+ max_angular_velocity (float): Maximum allowed angular velocity in rad/s
+ """
+ def __init__(self, robot_name: str) -> None:
+ """
+ Initialize the physically constrained hand model.
+
+ Args:
+ robot_name: Name of the target robot for coordinate alignment
+ """
+ super().__init__(robot_name)
+
+ # Define joint rotation limits (in radians) for each joint
+ # Format: (min_x, max_x, min_y, max_y, min_z, max_z) for XYZ Euler angles
+ small_angle = np.pi/40 # Small constraint for fine motor control
+
+ self.joint_limits: Dict[int, Tuple[float, float, float, float, float, float]] = {
+ # Thumb joints - more flexible due to opposable nature
+ 0: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to thumb mcp
+ 1: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # thumb mcp to pip
+ 2: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # thumb pip to dip
+ 3: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # thumb dip to tip
+
+ # Index finger joints - moderate constraints
+ 4: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to index mcp
+ 5: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # index mcp to pip
+ 6: (-small_angle, small_angle, -np.pi/8, np.pi/8, -small_angle, small_angle), # index pip to dip
+ 7: (-small_angle, small_angle, -np.pi/8, np.pi/8, -small_angle, small_angle), # index dip to tip
+
+ # Middle finger joints - tighter constraints for stability
+ 8: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to middle mcp
+ 9: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # middle mcp to pip
+ 10: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # middle pip to dip
+ 11: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # middle dip to tip
+
+ # Ring finger joints - similar to middle finger
+ 12: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to ring mcp
+ 13: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # ring mcp to pip
+ 14: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # ring pip to dip
+ 15: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # ring dip to tip
+
+ # Pinky finger joints - most constrained due to size
+ 16: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to pinky mcp
+ 17: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # pinky mcp to pip
+ 18: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # pinky pip to dip
+ 19: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # pinky dip to tip
+ }
+
+ # Maximum angular velocity constraint (2π rad/s = 360°/s)
+ self.max_angular_velocity: float = np.pi * 2
+
+ def reconstruct_vertices(self, input_vertices: np.ndarray, rotations: List[np.ndarray]) -> np.ndarray:
+ """
+ Reconstruct vertex positions from base vertex and constrained rotations.
+
+ This method applies the kinematic chain to reconstruct hand vertex positions
+ while respecting the calculated bone lengths from the input vertices.
+ This ensures consistent hand proportions while applying constraints.
+
+ Args:
+ input_vertices: Original vertex positions, shape (21, 3)
+ rotations: List of constrained rotation matrices
+
+ Returns:
+ Reconstructed vertex positions, shape (21, 3)
+ """
+ vertices = np.zeros((21, 3))
+ vertices[0] = input_vertices[0] # Wrist position remains fixed
+
+ # Calculate bone lengths from original vertices to maintain proportions
+ bone_lengths: Dict[Tuple[int, int], float] = {}
+ min_bone_length = 1e-6 # Minimum length to avoid numerical issues
+
+ # Extract bone lengths from the kinematic chain
+ for current in range(self.num_joints):
+ mapping = self.joint_to_neighbors_mapping[current]
+ current_vertex = mapping[0]
+ child_vertex = mapping[1]
+ parent_vertex = mapping[2]
+
+ # Calculate bone length for current->child connection
+ if child_vertex != -2:
+ length = np.linalg.norm(input_vertices[child_vertex] - input_vertices[current_vertex])
+ bone_lengths[(current_vertex, child_vertex)] = max(length, min_bone_length)
+
+ # Reconstruct positions following the kinematic chain
+ for current in range(self.num_joints):
+ mapping = self.joint_to_neighbors_mapping[current]
+ current_vertex = mapping[0]
+ child_vertex = mapping[1]
+ parent_vertex = mapping[2]
+
+ if child_vertex == -2:
+ continue
+
+            # Get position and rotation for this joint
+ current_pos = vertices[current_vertex]
+ rotation = rotations[current]
+
+ # Determine reference direction for rotation application
+ if parent_vertex == -1:
+ # Root joints use upward direction as reference
+ prev_dir = np.array([0, 0, 1])
+ else:
+ # Use direction from parent to current vertex
+ prev_dir = vertices[current_vertex] - vertices[parent_vertex]
+ prev_dir = prev_dir / np.linalg.norm(prev_dir)
+
+ # Apply rotation to get new direction
+ current_dir = rotation @ prev_dir
+
+ # Position child vertex using calculated bone length
+ bone_length = bone_lengths[(current_vertex, child_vertex)]
+ vertices[child_vertex] = current_pos + current_dir * bone_length
+
+ return vertices
+
+ def constrain_rotation(self, rotation_matrix: np.ndarray, joint_idx: int) -> np.ndarray:
+ """
+ Apply joint angle constraints to a rotation matrix.
+
+ This method converts the rotation to Euler angles, clips them to the
+ joint limits, and converts back to a rotation matrix. This ensures
+ all joint angles remain within anatomically realistic ranges.
+
+ Args:
+ rotation_matrix: 3x3 rotation matrix to constrain
+ joint_idx: Index of the joint for limit lookup
+
+ Returns:
+ Constrained 3x3 rotation matrix
+ """
+ try:
+ # Convert rotation matrix to Euler angles
+ rot = Rotation.from_matrix(rotation_matrix)
+ euler = rot.as_euler('xyz')
+
+ # Get joint limits for this joint
+ limits = self.joint_limits[joint_idx]
+
+ # Clip Euler angles to the specified limits
+ constrained_euler = np.clip(euler,
+ [limits[0], limits[2], limits[4]], # min limits
+ [limits[1], limits[3], limits[5]]) # max limits
+
+ # Convert back to rotation matrix if any clipping occurred
+ if not np.allclose(euler, constrained_euler):
+ return Rotation.from_euler('xyz', constrained_euler).as_matrix()
+ return rotation_matrix
+
+ except ValueError:
+ logger.error("Error constraining rotation")
+ # Return identity matrix if rotation is invalid
+ return np.eye(3)
+
+ def constrain_velocity(self, velocity: np.ndarray) -> np.ndarray:
+ """
+ Apply angular velocity constraints to limit motion speed.
+
+ This method ensures that joint angular velocities don't exceed the
+ maximum allowed velocity, preventing jerky or unrealistic motions.
+
+ Args:
+ velocity: Angular velocity vector to constrain
+
+ Returns:
+ Constrained angular velocity vector
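+
+        Example (illustrative; "panda" is a placeholder robot name):
+            model = PhysicallyConstrainedHandModel("panda")
+            v = np.array([10.0, 0.0, 0.0])             # 10 rad/s, above the 2*pi rad/s limit
+            v_limited = model.constrain_velocity(v)    # rescaled to 2*pi rad/s, same direction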
+ """
+ velocity_magnitude = np.linalg.norm(velocity)
+ if velocity_magnitude > self.max_angular_velocity:
+ # Scale velocity to maximum while preserving direction
+ return velocity * (self.max_angular_velocity / velocity_magnitude)
+ return velocity
+
+ def add_frame(self, vertices: np.ndarray, timestamp: float, finger_pts: Any) -> None:
+ """
+ Add a new frame with physical constraints applied.
+
+ This method extends the base add_frame functionality by applying
+ joint limits, velocity constraints, and enhanced grasp calculations.
+ The result is a more realistic and stable hand model suitable for
+ robot control applications.
+
+ Args:
+ vertices: Hand keypoints, shape (21, 3)
+ timestamp: Time of the frame in seconds
+ finger_pts: Additional finger point data (currently unused)
+ """
+ # Calculate initial rotations from raw vertex positions
+ rotations, rotations_xyz = self.calculate_frame_rotations(vertices)
+
+ # Apply joint angle constraints to all rotations
+ constrained_rotations: List[np.ndarray] = []
+ for joint_idx, rotation in enumerate(rotations):
+ constrained_rot = self.constrain_rotation(rotation, joint_idx)
+ constrained_rotations.append(constrained_rot)
+
+ # Apply velocity constraints if this is not the first frame
+ if len(self.timestamps) > 0:
+ dt = timestamp - self.timestamps[-1]
+ for joint_idx in range(self.num_joints):
+ # Calculate angular velocity for this joint
+ prev_rot = Rotation.from_matrix(self.joint_rotations[-1][joint_idx])
+ curr_rot = Rotation.from_matrix(constrained_rotations[joint_idx])
+ rel_rot = curr_rot * prev_rot.inv()
+ velocity = rel_rot.as_rotvec() / dt
+
+ # Apply velocity constraint if needed
+ if np.linalg.norm(velocity) > self.max_angular_velocity:
+ # Constrain velocity and reconstruct rotation
+ constrained_velocity = self.constrain_velocity(velocity)
+ delta_rot = Rotation.from_rotvec(constrained_velocity * dt)
+ new_rot = delta_rot * prev_rot
+ constrained_rotations[joint_idx] = new_rot.as_matrix()
+
+ # Reconstruct vertices with constrained rotations
+ constrained_vertices = self.reconstruct_vertices(vertices, constrained_rotations)
+
+ # Extract key points for grasp calculation
+ thumb_tip = constrained_vertices[4]
+ index_tip = constrained_vertices[8]
+
+ # Calculate grasp plane using thumb and index finger regions
+ grasp_plane = self.calculate_grasp_plane(constrained_vertices[3:9])
+
+ # Organize fingers for direction analysis
+ n_fingers = len(constrained_vertices) - 1
+ npts_per_finger = 4
+ list_fingers = [np.vstack([constrained_vertices[0], constrained_vertices[i:i + npts_per_finger]])
+ for i in range(1, n_fingers, npts_per_finger)]
+
+ # Calculate finger direction vector for plane orientation
+ dir_vec = list_fingers[1][1] - list_fingers[-1][1] # index to pinky MCP
+ dir_vec = dir_vec / np.linalg.norm(dir_vec)
+
+ # Ensure consistent plane orientation (normal pointing away from palm)
+ if np.dot(dir_vec, grasp_plane[:3]) > 0:
+ grasp_plane = -grasp_plane
+
+ # Create slightly offset plane for grasp point calculation
+ shifted_grasp_plane = self.get_parallel_plane(*grasp_plane, 0.01)
+ grasp_pt = self.calculate_grasp_point(shifted_grasp_plane, constrained_vertices)
+
+ # Calculate gripper orientation using the grasp plane
+ gripper_ori, _ = HandModel.get_gripper_orientation(thumb_tip, index_tip, constrained_vertices, grasp_plane)
+
+ # Apply coordinate frame transformations for robot compatibility
+ rot_90_deg = Rotation.from_euler('Z', 90, degrees=True).as_matrix()
+ grasp_ori = gripper_ori @ rot_90_deg
+
+ # Apply pitch adjustment
+ angle = -np.pi/18 * 1.0 # -10 degrees
+ grasp_ori = Rotation.from_rotvec(angle * np.array([1, 0, 0])).apply(grasp_ori)
+
+ # Offset grasp point along gripper Z-axis for clearance
+ unit_vectors = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+ transformed_vectors = unit_vectors @ grasp_ori.T
+ grasp_pt = grasp_pt - transformed_vectors[2] * 0.015 # 1.5cm offset
+
+ # Store all frame data
+ self.joint_rotations.append(constrained_rotations)
+ self.joint_rotations_xyz.append(rotations_xyz)
+ self.vertex_positions.append(constrained_vertices)
+ self.grasp_points.append(grasp_pt)
+ self.grasp_oris.append(grasp_ori)
+ self.timestamps.append(timestamp)
+
+
+def get_list_finger_pts_from_skeleton(skeleton_pts: np.ndarray) -> Dict[str, np.ndarray]:
+ """
+ Organize hand skeleton points into finger-specific groups.
+
+ This utility function takes the 21-point hand skeleton and organizes
+ it into a dictionary with separate arrays for each finger. This makes
+ it easier to perform finger-specific calculations and analysis.
+
+ Args:
+ skeleton_pts: Hand skeleton points, shape (21, 3)
+ Points are ordered as: wrist, thumb(4), index(4), middle(4), ring(4), pinky(4)
+
+ Returns:
+ Dictionary with finger names as keys and point arrays as values:
+ - "thumb": Wrist + 4 thumb points, shape (5, 3)
+ - "index": Wrist + 4 index points, shape (5, 3)
+ - "middle": Wrist + 4 middle points, shape (5, 3)
+ - "ring": Wrist + 4 ring points, shape (5, 3)
+ - "pinky": Wrist + 4 pinky points, shape (5, 3)
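+
+    Example (illustrative):
+        skeleton = np.random.rand(21, 3)               # stand-in hand skeleton
+        fingers = get_list_finger_pts_from_skeleton(skeleton)
+        assert fingers["index"].shape == (5, 3)        # wrist + the 4 index-finger points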
+ """
+ n_fingers = len(skeleton_pts) - 1 # Exclude wrist point
+ npts_per_finger = 4 # MCP, PIP, DIP, TIP for each finger
+
+ # Create finger arrays by combining wrist with each finger's points
+ list_fingers = [
+ np.vstack([skeleton_pts[0], skeleton_pts[i : i + npts_per_finger]])
+ for i in range(1, n_fingers, npts_per_finger)
+ ]
+
+ # Return organized finger dictionary
+ return {
+ "thumb": list_fingers[0],
+ "index": list_fingers[1],
+ "middle": list_fingers[2],
+ "ring": list_fingers[3],
+ "pinky": list_fingers[4]
+ }
\ No newline at end of file
diff --git a/phantom/phantom/process_data.py b/phantom/phantom/process_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..21e276043b093e33ae09f8503578c803bee873eb
--- /dev/null
+++ b/phantom/phantom/process_data.py
@@ -0,0 +1,243 @@
+import logging
+from enum import Enum
+from tqdm import tqdm
+from joblib import Parallel, delayed # type: ignore
+import hydra
+from omegaconf import DictConfig
+
+from phantom.processors.base_processor import BaseProcessor
+
+logging.basicConfig(level=logging.WARNING, format="%(name)s - %(levelname)s - %(message)s")
+
+class ProcessingMode(Enum):
+ """Enumeration of valid processing modes."""
+ BBOX = "bbox"
+ HAND2D = "hand2d"
+ HAND3D = "hand3d"
+ HAND_SEGMENTATION = "hand_segmentation"
+ ARM_SEGMENTATION = "arm_segmentation"
+ ACTION = "action"
+ SMOOTHING = "smoothing"
+ HAND_INPAINT = "hand_inpaint"
+ ROBOT_INPAINT = "robot_inpaint"
+ ALL = "all"
+
+PROCESSING_ORDER = [
+ "bbox",
+ "hand2d",
+ "arm_segmentation",
+ "hand_segmentation",
+ "hand3d",
+ "action",
+ "smoothing",
+ "hand_inpaint",
+ "robot_inpaint",
+]
+
+PROCESSING_ORDER_EPIC = [
+ "bbox",
+ "hand2d",
+ "arm_segmentation",
+ "action",
+ "smoothing",
+ "hand_inpaint",
+ "robot_inpaint",
+]
+
+def select_modes(cfg: DictConfig, processing_order: list) -> list:
+    """Resolve cfg.mode (plain string, comma-separated string, or list) into an ordered list of modes."""
+    if isinstance(cfg.mode, str):
+        if ',' in cfg.mode:
+            # Comma-separated string: keep the user-provided order
+            modes = [mode.strip() for mode in cfg.mode.split(',')]
+        else:
+            # Single string: keep the canonical processing order
+            return [m for m in processing_order if m in cfg.mode or "all" in cfg.mode]
+    else:
+        # List of modes: keep the user-provided order
+        modes = list(cfg.mode)
+
+    selected_modes = []
+    for mode in modes:
+        if mode == "all":
+            selected_modes.extend(processing_order)
+        elif mode in processing_order:
+            selected_modes.append(mode)
+    return selected_modes
+
+def process_one_demo(data_sub_folder: str, cfg: DictConfig, processor_classes: dict) -> None:
+    # Choose processing order based on epic flag
+    processing_order = PROCESSING_ORDER_EPIC if cfg.epic else PROCESSING_ORDER
+    selected_modes = select_modes(cfg, processing_order)
+
+ for mode in selected_modes:
+ print(f"----------------- {mode.upper()} PROCESSOR -----------------")
+ processor_cls = processor_classes[mode]
+ processor = processor_cls(cfg)
+ try:
+ processor.process_one_demo(data_sub_folder)
+ except Exception as e:
+ print(f"Error in {mode} processing: {e}")
+ if cfg.debug:
+ raise
+
+def process_all_demos(cfg: DictConfig, processor_classes: dict) -> None:
+ # Choose processing order based on epic flag
+ processing_order = PROCESSING_ORDER_EPIC if cfg.epic else PROCESSING_ORDER
+
+    selected_modes = select_modes(cfg, processing_order)
+
+ base_processor = BaseProcessor(cfg)
+ all_data_folders = base_processor.all_data_folders.copy()
+ for mode in selected_modes:
+ print(f"----------------- {mode.upper()} PROCESSOR -----------------")
+ processor_cls = processor_classes[mode]
+ processor = processor_cls(cfg)
+ for data_sub_folder in tqdm(all_data_folders):
+ try:
+ processor.process_one_demo(data_sub_folder)
+ except Exception as e:
+ print(f"Error in {mode} processing: {e}")
+ if cfg.debug:
+ raise
+
+def process_all_demos_parallel(cfg: DictConfig, processor_classes: dict) -> None:
+ # Choose processing order based on epic flag
+ processing_order = PROCESSING_ORDER_EPIC if cfg.epic else PROCESSING_ORDER
+
+    selected_modes = select_modes(cfg, processing_order)
+
+ base_processor = BaseProcessor(cfg)
+ all_data_folders = base_processor.all_data_folders.copy()
+ for mode in selected_modes:
+ print(f"----------------- {mode.upper()} PROCESSOR -----------------")
+ processor_cls = processor_classes[mode]
+ processor = processor_cls(cfg)
+ Parallel(n_jobs=cfg.n_processes)(
+ delayed(processor.process_one_demo)(data_sub_folder) for data_sub_folder in all_data_folders
+ )
+
+def get_processor_classes(cfg: DictConfig) -> dict:
+ """Initialize the processor classes"""
+ from phantom.processors.bbox_processor import BBoxProcessor
+ from phantom.processors.segmentation_processor import HandSegmentationProcessor, ArmSegmentationProcessor
+ from phantom.processors.hand_processor import Hand2DProcessor, Hand3DProcessor
+ from phantom.processors.action_processor import ActionProcessor
+ from phantom.processors.smoothing_processor import SmoothingProcessor
+ from phantom.processors.robotinpaint_processor import RobotInpaintProcessor
+ from phantom.processors.handinpaint_processor import HandInpaintProcessor
+
+ return {
+ "bbox": BBoxProcessor,
+ "hand2d": Hand2DProcessor,
+ "hand3d": Hand3DProcessor,
+ "hand_segmentation": HandSegmentationProcessor,
+ "arm_segmentation": ArmSegmentationProcessor,
+ "action": ActionProcessor,
+ "smoothing": SmoothingProcessor,
+ "robot_inpaint": RobotInpaintProcessor,
+ "hand_inpaint": HandInpaintProcessor,
+ }
+
+def validate_mode(cfg: DictConfig) -> None:
+ """
+ Validate that the mode parameter contains only valid processing modes.
+
+ Args:
+ cfg: Configuration object containing mode parameter
+
+ Raises:
+ ValueError: If mode contains invalid options
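+
+    Example (illustrative):
+        from omegaconf import OmegaConf
+        validate_mode(OmegaConf.create({"mode": "bbox,hand3d"}))   # passes silently
+        validate_mode(OmegaConf.create({"mode": "foo"}))           # raises ValueError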
+ """
+ if isinstance(cfg.mode, str):
+ # Handle comma-separated string format
+ if ',' in cfg.mode:
+ modes = [mode.strip() for mode in cfg.mode.split(',')]
+ else:
+ modes = [cfg.mode]
+ else:
+ modes = cfg.mode
+
+ # Get valid modes from enum
+ valid_modes = {mode.value for mode in ProcessingMode}
+ invalid_modes = [mode for mode in modes if mode not in valid_modes]
+
+ if invalid_modes:
+ valid_mode_list = [mode.value for mode in ProcessingMode]
+ raise ValueError(
+ f"Invalid mode(s): {invalid_modes}. "
+ f"Valid modes are: {valid_mode_list}"
+ )
+
+def main(cfg: DictConfig):
+ # Validate mode parameter
+ validate_mode(cfg)
+
+ # Get processor classes
+ processor_classes = get_processor_classes(cfg)
+
+ if cfg.n_processes > 1:
+ process_all_demos_parallel(cfg, processor_classes)
+ elif cfg.demo_num is not None:
+ process_one_demo(cfg.demo_num, cfg, processor_classes)
+ else:
+ process_all_demos(cfg, processor_classes)
+
+@hydra.main(version_base=None, config_path="../configs", config_name="default")
+def hydra_main(cfg: DictConfig):
+ """
+ Main entry point using Hydra configuration.
+
+ Example usage:
+ - Process all demos with bbox: python process_data.py mode=bbox
+ - Process single demo: python process_data.py mode=bbox demo_num=0
+ - Use EPIC dataset: python process_data.py dataset=epic mode=bbox
+ - Parallel processing: python process_data.py mode=bbox n_processes=4
+ - Process multiple modes sequentially: python process_data.py mode=bbox,hand3d
+ - Process with custom order: python process_data.py mode=hand3d,bbox,action
+ - Process with bracket notation (use quotes): python process_data.py "mode=[bbox,hand3d]"
+ """
+ main(cfg)
+
+if __name__ == "__main__":
+ hydra_main()
diff --git a/phantom/phantom/processors/__init__.py b/phantom/phantom/processors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/phantom/processors/action_processor.py b/phantom/phantom/processors/action_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b7e029d30cfc9d97c142b327b84b1101b22756b
--- /dev/null
+++ b/phantom/phantom/processors/action_processor.py
@@ -0,0 +1,478 @@
+"""
+Action Processor Module
+
+This module processes hand motion capture data and converts it into robot-executable actions.
+It handles both single-arm and bimanual robotic setups, converting detected hand keypoints
+into end-effector positions, orientations, and gripper widths that can be used for robot control.
+
+Key Features:
+- Converts hand keypoints from camera frame to robot frame
+- Supports both unconstrained and physically constrained hand models
+- Handles missing hand detections with interpolation
+- Processes bimanual data with union-based frame selection
+- Generates neutral poses when no hand data is available
+
+The processor follows this pipeline:
+1. Load hand sequence data (keypoints, detection flags)
+2. Convert keypoints to robot coordinate frame
+3. Apply hand model constraints (optional)
+4. Extract end-effector poses and gripper states
+5. Refine actions to handle missing detections
+6. Save processed actions for robot execution
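+
+Example (minimal sketch; the config object and demo folder name are placeholders):
+    processor = ActionProcessor(cfg)           # cfg: Hydra/OmegaConf config for the run
+    processor.process_one_demo("demo_000")     # hypothetical demo sub-folder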
+"""
+
+import os
+import numpy as np
+from typing import Tuple, Optional
+from dataclasses import dataclass
+import logging
+from scipy.spatial.transform import Rotation
+
+from phantom.processors.base_processor import BaseProcessor
+from phantom.processors.phantom_data import HandSequence
+from phantom.processors.paths import Paths
+from phantom.hand import HandModel, PhysicallyConstrainedHandModel, get_list_finger_pts_from_skeleton
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class EEActions:
+ """
+ Container for bimanual end-effector action data.
+
+ This dataclass holds the processed robot actions for a sequence of timesteps,
+ including 3D positions, 3D orientations, and gripper opening widths.
+
+ Attributes:
+ ee_pts (np.ndarray): End-effector positions, shape (N, 3) in robot frame coordinates
+ ee_oris (np.ndarray): End-effector orientations as rotation matrices, shape (N, 3, 3)
+ ee_widths (np.ndarray): Gripper opening widths in meters, shape (N,)
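+
+    Example (illustrative shapes only):
+        N = 120                                    # number of timesteps
+        actions = EEActions(
+            ee_pts=np.zeros((N, 3)),
+            ee_oris=np.tile(np.eye(3), (N, 1, 1)),
+            ee_widths=np.full(N, 0.08),            # hypothetical 8 cm opening
+        )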
+ """
+ ee_pts: np.ndarray # End-effector positions (N, 3)
+ ee_oris: np.ndarray # End-effector orientations (N, 3, 3) as rotation matrices
+ ee_widths: np.ndarray # Gripper widths (N,)
+
+class ActionProcessor(BaseProcessor):
+ """
+ Processor for converting hand motion capture data into robot-executable actions.
+
+ This class handles the complete pipeline from raw hand keypoints to refined robot actions.
+ It supports both single-arm and bimanual robotic setups, with intelligent handling of
+ missing hand detections and physically realistic constraints.
+
+ The processor can operate in different modes:
+ - Single arm: Processes only left or right hand data
+ - Bimanual: Processes both hands with union-based frame selection
+
+ Key processing steps:
+ 1. Load hand sequences with 3D keypoints and detection flags
+ 2. Transform keypoints from camera frame to robot frame
+ 3. Fit hand model (optionally with physical constraints)
+ 4. Extract end-effector poses and gripper states
+ 5. Refine actions using last-valid-value interpolation
+ 6. Generate neutral poses for undetected periods
+
+ Attributes:
+ dt (float): Time delta between frames (1/15 seconds for 15Hz processing)
+ bimanual_setup (str): Setup type ("single_arm", "shoulders", etc.)
+ target_hand (str): Which hand to process in single-arm mode ("left"/"right")
+ constrained_hand (bool): Whether to use physically constrained hand model
+ T_cam2robot (np.ndarray): 4x4 transformation matrix from camera to robot frame
+ """
+ def __init__(self, args):
+ # Set processing frequency to 15Hz
+ self.dt = 1/15
+ super().__init__(args)
+
+ def process_one_demo(self, data_sub_folder: str) -> None:
+ """
+ Process a single demonstration recording into robot actions.
+
+ This is the main entry point for processing one demo. It handles both
+ single-arm and bimanual processing modes, loading the raw hand data,
+ converting it to robot actions, and saving the results.
+
+ Args:
+ data_sub_folder (str): Path to the folder containing this demo's data
+ """
+ save_folder = self.get_save_folder(data_sub_folder)
+ paths = self.get_paths(save_folder)
+
+ # Load hand sequence data for both hands
+ left_sequence, right_sequence = self._load_sequences(paths)
+
+ # Handle single-arm processing mode
+ if self.bimanual_setup == "single_arm":
+ self._process_single_arm(left_sequence, right_sequence, paths)
+ else:
+ self._process_bimanual(left_sequence, right_sequence, paths)
+
+ def _process_single_arm(self, left_sequence: HandSequence, right_sequence: HandSequence, paths) -> None:
+ """Process single-arm setup with one target hand."""
+ # Select target hand based on configuration
+ target_sequence = left_sequence if self.target_hand == "left" else right_sequence
+
+ # Process the selected hand sequence
+ target_actions = self._process_hand_sequence(target_sequence, self.T_cam2robot)
+
+ # Get indices where hand was detected for this sequence
+ union_indices = np.where(target_sequence.hand_detected)[0]
+
+ # Refine actions to handle missing detections
+ target_actions_refined = self._refine_actions(target_sequence, target_actions, union_indices, self.target_hand)
+
+ # Save results for the selected hand only
+ if self.target_hand == "left":
+ self._save_results(paths, union_indices=union_indices, left_actions=target_actions_refined)
+ else:
+ self._save_results(paths, union_indices=union_indices, right_actions=target_actions_refined)
+
+ def _process_bimanual(self, left_sequence: HandSequence, right_sequence: HandSequence, paths) -> None:
+ """Process bimanual setup with both hands."""
+ # Process both hand sequences
+ left_actions = self._process_hand_sequence(left_sequence, self.T_cam2robot)
+ right_actions = self._process_hand_sequence(right_sequence, self.T_cam2robot)
+
+ # Combine detection results using OR logic - frame is valid if either hand detected
+ union_indices = np.where(left_sequence.hand_detected | right_sequence.hand_detected)[0]
+
+ # Refine actions for both hands using the union indices
+ left_actions_refined = self._refine_actions(left_sequence, left_actions, union_indices, "left")
+ right_actions_refined = self._refine_actions(right_sequence, right_actions, union_indices, "right")
+
+ # Save results for both hands
+ self._save_results(paths, union_indices, left_actions_refined, right_actions_refined)
+
+
+ def _load_sequences(self, paths) -> Tuple[HandSequence, HandSequence]:
+ """
+ Load hand sequences from disk for both left and right hands.
+
+ HandSequence objects contain the processed keypoint data, detection flags,
+ and other metadata needed for action processing.
+
+ Args:
+ paths: Paths object containing file locations for hand data
+
+ Returns:
+ Tuple[HandSequence, HandSequence]: Left and right hand sequences
+ """
+ return (
+ HandSequence.load(paths.hand_data_left),
+ HandSequence.load(paths.hand_data_right)
+ )
+
+ def _process_hand_sequence(
+ self,
+ sequence: HandSequence,
+ T_cam2robot: np.ndarray,
+ ) -> EEActions:
+ """
+ Process a single hand sequence into end-effector actions.
+
+ This method performs the following processing pipeline for one hand:
+ 1. Transform keypoints from camera frame to robot frame
+ 2. Fit a hand model to the keypoint sequence
+ 3. Extract end-effector poses and gripper states
+
+ Args:
+ sequence (HandSequence): Hand keypoint sequence with detection flags
+ T_cam2robot (np.ndarray): 4x4 transformation matrix from camera to robot frame
+
+ Returns:
+ EEActions: Processed end-effector positions, orientations, and gripper widths
+ """
+ # Convert keypoints from camera frame to robot frame coordinates
+ kpts_3d_cf = sequence.kpts_3d # Camera frame keypoints
+ kpts_3d_rf = ActionProcessor._convert_pts_to_robot_frame(
+ kpts_3d_cf,
+ T_cam2robot
+ )
+
+ # Create and fit hand model to the keypoint sequence
+ hand_model = self._get_hand_model(kpts_3d_rf, sequence.hand_detected)
+
+ # Extract end-effector poses and gripper states from fitted model
+ kpts_3d, ee_pts, ee_oris = self._get_model_keypoints(hand_model)
+
+ # Compute gripper opening distances from fingertip positions
+ ee_widths = self._compute_gripper_distances(
+ kpts_3d,
+ sequence.hand_detected
+ )
+
+ return EEActions(
+ ee_pts=ee_pts,
+ ee_oris=ee_oris,
+ ee_widths=ee_widths,
+ )
+
+ def _get_hand_model(self, kpts_3d_rf: np.ndarray, hand_detected: np.ndarray) -> HandModel | PhysicallyConstrainedHandModel:
+ """
+ Create and fit a hand model to the keypoint sequence.
+
+ The hand model can be either unconstrained (simple fitting) or physically
+ constrained (enforces realistic hand poses and robot constraints).
+
+ Args:
+ kpts_3d_rf (np.ndarray): Hand keypoints in robot frame, shape (N, 21, 3)
+ hand_detected (np.ndarray): Boolean array indicating valid detections, shape (N,)
+
+ Returns:
+ HandModel | PhysicallyConstrainedHandModel: Fitted hand model with trajectory data
+ """
+ # Choose hand model type based on configuration
+ if self.constrained_hand:
+ hand_model = PhysicallyConstrainedHandModel(self.robot)
+ else:
+ hand_model = HandModel(self.robot)
+
+ # Add each frame to the model for trajectory fitting
+ for t_idx in range(len(kpts_3d_rf)):
+ hand_model.add_frame(
+ kpts_3d_rf[t_idx],
+ t_idx * self.dt, # Convert frame index to time
+ hand_detected[t_idx]
+ )
+ return hand_model
+
+ def _get_model_keypoints(self, model: HandModel | PhysicallyConstrainedHandModel) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """
+ Extract keypoints and end-effector data from fitted hand model.
+
+ Args:
+ model (HandModel | PhysicallyConstrainedHandModel): Fitted hand model
+
+ Returns:
+ Tuple containing:
+ - kpts_3d (np.ndarray): Model keypoint positions, shape (N, 21, 3)
+ - ee_pts (np.ndarray): End-effector positions, shape (N, 3)
+ - ee_oris (np.ndarray): End-effector orientations, shape (N, 3, 3)
+ """
+ kpts_3d = np.array(model.vertex_positions) # All hand keypoints
+ ee_pts = np.array(model.grasp_points) # End-effector positions (palm center)
+ ee_oris = np.array(model.grasp_oris) # End-effector orientations (rotation matrices)
+ return kpts_3d, ee_pts, ee_oris
+
+ def _compute_gripper_distances(
+ self,
+ kpts_3d_rf: np.ndarray,
+ hand_detected: np.ndarray
+ ) -> np.ndarray:
+ """
+ Compute gripper opening distances for all frames in the sequence.
+
+ The gripper distance is calculated as the Euclidean distance between
+ the thumb tip and index finger tip, providing a proxy for gripper state.
+
+ Args:
+ kpts_3d_rf (np.ndarray): Hand keypoints in robot frame, shape (N, 21, 3)
+ hand_detected (np.ndarray): Boolean flags for valid detections, shape (N,)
+
+ Returns:
+ np.ndarray: Gripper distances for each frame, shape (N,)
+ """
+ gripper_dists = np.zeros(len(kpts_3d_rf))
+
+ for idx in range(len(kpts_3d_rf)):
+ if hand_detected[idx]:
+ # Only compute distance for frames with valid hand detection
+ gripper_dists[idx] = ActionProcessor._compute_gripper_opening(
+ kpts_3d_rf[idx]
+ )
+ # Note: Invalid frames remain at 0.0, will be refined later
+ return gripper_dists
+
+ def _refine_actions(
+ self,
+ sequence: HandSequence,
+ actions: EEActions,
+ union_indices: np.ndarray,
+ hand_side: str
+ ) -> EEActions:
+ """
+ Refine actions to handle missing hand detections using last-valid-value interpolation.
+
+ When hand detection fails, this method fills in missing values by carrying forward
+ the last valid pose and gripper state. This creates smooth, executable trajectories
+ even when the vision system temporarily loses tracking.
+
+ Args:
+ sequence (HandSequence): Original hand sequence with detection flags
+ actions (EEActions): Raw actions from hand model
+ union_indices (np.ndarray): Frame indices to include in final trajectory
+ hand_side (str): "left" or "right" for neutral pose generation
+
+ Returns:
+ EEActions: Refined actions with interpolated values for missing detections
+ """
+ # Find frames where this hand was actually detected
+ hand_detected_indices = np.where(sequence.hand_detected)[0]
+
+ # If no valid detections, return neutral pose for entire sequence
+ if len(hand_detected_indices) == 0:
+ return self._get_neutral_actions(hand_side, len(union_indices))
+
+ # Apply carry-forward interpolation
+ return self._apply_carry_forward_interpolation(sequence, actions, union_indices, hand_detected_indices)
+
+ def _apply_carry_forward_interpolation(
+ self,
+ sequence: HandSequence,
+ actions: EEActions,
+ union_indices: np.ndarray,
+ hand_detected_indices: np.ndarray
+ ) -> EEActions:
+ """Apply last-valid-value interpolation to fill missing detections."""
+ # Initialize with first valid detection values
+ first_valid_idx = hand_detected_indices[0]
+ last_valid_pt = actions.ee_pts[first_valid_idx]
+ last_valid_ori = actions.ee_oris[first_valid_idx]
+ last_valid_width = actions.ee_widths[first_valid_idx]
+
+ # Process each frame in the union sequence
+ ee_pts_refined = []
+ ee_oris_refined = []
+ ee_widths_refined = []
+
+ for idx in union_indices:
+ if sequence.hand_detected[idx]:
+ # Update with new valid values when available
+ last_valid_pt = actions.ee_pts[idx]
+ last_valid_ori = actions.ee_oris[idx]
+ last_valid_width = actions.ee_widths[idx]
+
+ # Always append the last valid values (carry-forward for missing frames)
+ ee_pts_refined.append(last_valid_pt)
+ ee_oris_refined.append(last_valid_ori)
+ ee_widths_refined.append(last_valid_width)
+
+ return EEActions(
+ ee_pts=np.array(ee_pts_refined),
+ ee_oris=np.array(ee_oris_refined),
+ ee_widths=np.array(ee_widths_refined),
+ )
+
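+    # Worked example (illustrative): for union frames 0..3 with hand_detected =
+    # [True, False, False, True], frames 1 and 2 reuse the pose and width from
+    # frame 0, and frame 3 switches to its own detected values, so the refined
+    # trajectory never contains gaps or zeroed poses.
+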
+ def _get_neutral_actions(self, hand_side: str, n_frames: int) -> EEActions:
+ """
+ Generate neutral pose actions when no hand detection is available.
+
+ Neutral poses place the robot arms in out-of-frame positions.
+
+ Args:
+ hand_side (str): "left" or "right" to determine which neutral pose to use
+ n_frames (int): Number of frames to generate
+
+ Returns:
+ EEActions: Neutral pose actions for the specified number of frames
+ """
+ # Define neutral pose configurations
+ neutral_configs = {
+ "single_arm": {
+ "right": {"pos": [0.2, -0.8, 0.3], "quat": [1, 0.0, 0.0, 0.0]},
+ "left": {"pos": [0.2, 0.8, 0.3], "quat": [1, 0.0, 0.0, 0.0]}
+ },
+ "shoulders": {
+ "right": {"pos": [0.4, -0.5, 0.3], "quat": [-0.7071, 0.0, 0.0, 0.7071]},
+ "left": {"pos": [0.4, 0.5, 0.3], "quat": [0.7071, 0.0, 0.0, 0.7071]}
+ }
+ }
+
+ # Get configuration for current setup and hand
+ config = neutral_configs[self.bimanual_setup][hand_side]
+
+ # Convert to numpy arrays and create rotation matrix
+ neutral_pos = np.array(config["pos"])
+ neutral_ori = Rotation.from_quat(config["quat"], scalar_first=False).as_matrix()
+ neutral_width = 0.085 # Standard gripper opening (8.5cm)
+
+ # Create arrays replicated for all frames
+ return EEActions(
+ ee_pts=np.repeat(neutral_pos.reshape(1, 3), n_frames, axis=0),
+ ee_oris=np.repeat(neutral_ori.reshape(1, 3, 3), n_frames, axis=0),
+ ee_widths=np.full(n_frames, neutral_width)
+ )
+
+ def _save_results(
+ self,
+ paths: Paths,
+ union_indices: np.ndarray,
+ left_actions: Optional[EEActions] = None,
+ right_actions: Optional[EEActions] = None,
+ ) -> None:
+ """
+ Save processed action results to disk in NPZ format.
+
+ The saved files contain all necessary data for robot execution:
+ - union_indices: Valid frame indices in the original sequence
+ - ee_pts: End-effector positions
+ - ee_oris: End-effector orientations (rotation matrices)
+ - ee_widths: Gripper opening widths
+
+ Args:
+ paths (Paths): File path configuration object
+ union_indices (np.ndarray): Valid frame indices
+ left_actions (Optional[EEActions]): Left hand actions to save
+ right_actions (Optional[EEActions]): Right hand actions to save
+ """
+ # Create output directory if it doesn't exist
+ os.makedirs(paths.action_processor, exist_ok=True)
+
+ # Save actions for each hand if provided
+ if left_actions is not None:
+ self._save_hand_actions(paths.actions_left, union_indices, left_actions)
+ if right_actions is not None:
+ self._save_hand_actions(paths.actions_right, union_indices, right_actions)
+
+ def _save_hand_actions(self, base_path: str, union_indices: np.ndarray, actions: EEActions) -> None:
+ """Save actions for a single hand to NPZ file."""
+ file_path = str(base_path).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ np.savez(
+ file_path,
+ union_indices=union_indices,
+ ee_pts=actions.ee_pts,
+ ee_oris=actions.ee_oris,
+ ee_widths=actions.ee_widths
+ )
+
+ @staticmethod
+ def _compute_gripper_opening(skeleton_pts: np.ndarray) -> float:
+ """
+ Compute gripper opening distance from hand keypoints for a single frame.
+
+ The gripper distance is calculated as the Euclidean distance between
+ the thumb tip and index finger tip.
+
+ Args:
+ skeleton_pts (np.ndarray): Hand keypoints for one frame, shape (21, 3)
+
+ Returns:
+ float: Distance between thumb tip and index finger tip in meters
+ """
+ # Extract finger tip positions from the hand skeleton
+ finger_dict = get_list_finger_pts_from_skeleton(skeleton_pts)
+
+ # Compute distance between thumb tip and index finger tip
+ return np.linalg.norm(finger_dict["thumb"][-1] - finger_dict["index"][-1])
+
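+    # Worked example (illustrative): with the thumb tip at [0.10, 0.00, 0.00] and
+    # the index tip at [0.10, 0.08, 0.00] (meters), the returned opening is
+    # ||[0, 0.08, 0]|| = 0.08, i.e. an 8 cm gripper width.
+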
+ @staticmethod
+ def _convert_pts_to_robot_frame(skeleton_poses_cf: np.ndarray, T_cam2robot: np.ndarray) -> np.ndarray:
+ """
+ Convert hand keypoints from camera frame to robot frame coordinates.
+
+ Args:
+ skeleton_poses_cf (np.ndarray): Hand poses in camera frame, shape (N, 21, 3)
+ T_cam2robot (np.ndarray): 4x4 transformation matrix from camera to robot frame
+
+ Returns:
+ np.ndarray: Hand poses in robot frame, shape (N, 21, 3)
+ """
+ # Convert to homogeneous coordinates by adding ones
+ pts_h = np.ones((skeleton_poses_cf.shape[0], skeleton_poses_cf.shape[1], 1))
+ skeleton_poses_cf_h = np.concatenate([skeleton_poses_cf, pts_h], axis=-1)
+
+ # Apply transformation matrix to convert coordinate frames
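+        # (The einsum 'ij,bpj->bpi' left-multiplies every homogeneous point by the
+        # 4x4 matrix, i.e. p_rf = T_cam2robot @ p_cf for each of the N*21 points,
+        # without an explicit Python loop.)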
+ skeleton_poses_rf_h0 = np.einsum('ij,bpj->bpi', T_cam2robot, skeleton_poses_cf_h)
+
+ # Remove homogeneous coordinate and return 3D points
+ return skeleton_poses_rf_h0[..., :3]
\ No newline at end of file
diff --git a/phantom/phantom/processors/base_processor.py b/phantom/phantom/processors/base_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..12b6c85b0d1420415f833954bd2195341a153a27
--- /dev/null
+++ b/phantom/phantom/processors/base_processor.py
@@ -0,0 +1,209 @@
+import os
+import json
+import logging
+import numpy as np
+import shutil
+import errno
+from typing import Tuple
+from pathlib import Path
+from omegaconf import DictConfig
+
+from phantom.utils.data_utils import get_parent_folder_of_package
+from phantom.utils.image_utils import get_intrinsics_from_json, get_transformation_matrix_from_extrinsics
+from phantom.processors.paths import Paths, PathsConfig
+
+logger = logging.getLogger(__name__)
+
+class BaseProcessor:
+ def __init__(self, cfg: DictConfig):
+ # Store configuration for potential future use
+ self.cfg = cfg
+
+ # Apply configuration to instance attributes
+ self._apply_config(cfg)
+
+ # Validate configuration
+ self._validate_config(cfg)
+
+ # Set up paths and data folders
+ self._setup_paths_and_folders(cfg)
+
+ # Initialize camera parameters
+ self._init_camera_parameters()
+
+ def _apply_config(self, cfg: DictConfig) -> None:
+ """Apply configuration to instance attributes."""
+ # Basic attributes
+ self.input_resolution = cfg.input_resolution
+ self.output_resolution = cfg.output_resolution
+ self.project_folder = get_parent_folder_of_package("phantom")
+ self.debug = cfg.debug
+ self.n_processes = cfg.n_processes
+ self.verbose = cfg.verbose
+ self.skip_existing = cfg.skip_existing
+ self.robot = cfg.robot
+ self.gripper = cfg.gripper
+ self.square = cfg.square
+ self.epic = cfg.epic
+ self.bimanual_setup = cfg.bimanual_setup
+ self.target_hand = cfg.target_hand
+ self.constrained_hand = cfg.constrained_hand
+ self.depth_for_overlay = cfg.depth_for_overlay
+ self.render = cfg.render
+ self.debug_cameras = getattr(cfg, 'debug_cameras', [])
+
+ # Apply bimanual setup logic
+ if self.bimanual_setup != "single_arm":
+ self.target_hand = "both"
+
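+    # Illustrative sketch (field names taken from _apply_config / _setup_paths_and_folders;
+    # the concrete values below are assumptions): a minimal config could look roughly like
+    #   OmegaConf.create({
+    #       "input_resolution": 1080, "output_resolution": 256,
+    #       "debug": False, "n_processes": 1, "verbose": False, "skip_existing": True,
+    #       "robot": "panda", "gripper": "panda", "square": True, "epic": False,
+    #       "bimanual_setup": "single_arm", "target_hand": "right",
+    #       "constrained_hand": True, "depth_for_overlay": False, "render": False,
+    #       "data_root_dir": "data/raw", "processed_data_root_dir": "data/processed",
+    #       "demo_name": "pick_and_place",
+    #       "camera_intrinsics": "camera/camera_intrinsics.json",
+    #   })
+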
+ def _validate_config(self, cfg: DictConfig) -> None:
+ """Validate critical configuration parameters."""
+ if cfg.input_resolution <= 0 or cfg.output_resolution <= 0:
+ raise ValueError(f"Resolutions must be positive: input={cfg.input_resolution}, output={cfg.output_resolution}")
+
+ if not os.path.exists(cfg.data_root_dir):
+ raise FileNotFoundError(f"Data root directory not found: {cfg.data_root_dir}")
+
+ if not os.path.exists(cfg.camera_intrinsics):
+ raise FileNotFoundError(f"Camera intrinsics file not found: {cfg.camera_intrinsics}")
+
+ def _setup_paths_and_folders(self, cfg: DictConfig) -> None:
+ """Set up paths configuration and create necessary directories."""
+ # Set up paths configuration
+ self.paths_config = PathsConfig()
+ self.paths_config.config['data_root'] = cfg.data_root_dir
+ self.paths_config.config['processed_root'] = cfg.processed_data_root_dir
+
+ self.data_folder = os.path.join(cfg.data_root_dir, cfg.demo_name)
+ self.processed_data_folder = os.path.join(cfg.processed_data_root_dir, cfg.demo_name)
+
+ # Validate that data folder exists
+ if not os.path.exists(self.data_folder):
+ raise FileNotFoundError(f"Data folder not found: {self.data_folder}")
+
+ os.makedirs(self.processed_data_folder, exist_ok=True)
+
+ # Get all folders in data_folder
+ try:
+ all_data_folders = [d1 for d1 in os.listdir(self.data_folder) if os.path.isdir(os.path.join(self.data_folder, d1))]
+ self.all_data_folders = sorted(all_data_folders, key=lambda x: int(x))
+ self.all_data_folders_idx = {x: idx for idx, x in enumerate(self.all_data_folders)}
+ except OSError as e:
+ if e.errno == errno.EACCES:
+ raise PermissionError(f"Permission denied accessing data folder: {self.data_folder}")
+ elif e.errno == errno.ENOENT:
+ raise FileNotFoundError(f"Data folder not found: {self.data_folder}")
+ else:
+ raise RuntimeError(f"OS error accessing data folder {self.data_folder}: {e}")
+ except ValueError as e:
+ raise ValueError(f"Invalid folder name format in {self.data_folder}. Folders should be numbered: {e}")
+
+ def _init_camera_parameters(self) -> None:
+ """Initialize camera intrinsics and extrinsics."""
+ # Get camera intrinsics and extrinsics
+ self.intrinsics_dict, self.intrinsics_matrix = self.get_intrinsics(self.cfg.camera_intrinsics)
+
+ # Use camera_extrinsics from config if available, otherwise determine from bimanual_setup
+ if hasattr(self.cfg, 'camera_extrinsics') and self.cfg.camera_extrinsics:
+ camera_extrinsics_path = self.cfg.camera_extrinsics
+ else:
+ camera_extrinsics_path = self._get_camera_extrinsics_path()
+
+ self.T_cam2robot, self.extrinsics = self.get_extrinsics(camera_extrinsics_path)
+
+ def _get_camera_extrinsics_path(self) -> str:
+ """Get the appropriate camera extrinsics path based on bimanual setup."""
+ if self.bimanual_setup == "shoulders":
+ return "camera/camera_extrinsics_ego_bimanual_shoulders.json"
+ elif self.bimanual_setup == "single_arm":
+ return "camera/camera_extrinsics.json"
+ else:
+ raise ValueError(f"Invalid bimanual setup: {self.bimanual_setup}. Must be 'single_arm' or 'shoulders'.")
+
+ def get_paths(self, data_path: str) -> Paths:
+ """
+ Get all file paths for a demo.
+
+ Args:
+ data_path: Path to the demo data
+
+ Returns:
+ Paths object containing all file paths
+ """
+ paths = Paths(
+ data_path=Path(data_path),
+ robot_name=self.robot
+ )
+ paths.ensure_directories_exist()
+ return paths
+
+ def get_save_folder(self, data_sub_folder: str) -> str:
+ data_sub_folder_fullpath = os.path.join(self.data_folder, str(data_sub_folder))
+ save_folder = os.path.join(self.processed_data_folder, str(data_sub_folder))
+ # Check existing dirs using os.scandir
+ with os.scandir(self.processed_data_folder) as it:
+ existing_dirs = {entry.name for entry in it if entry.is_dir()}
+ if str(data_sub_folder) not in existing_dirs:
+ shutil.copytree(data_sub_folder_fullpath, save_folder)
+ return save_folder
+
+ def process_one_demo(self, data_sub_folder: str):
+ raise NotImplementedError
+
+ def get_intrinsics(self, intrinsics_path: str) -> Tuple[dict, np.ndarray]:
+ intrinsics_matrix, intrinsics_dict = get_intrinsics_from_json(intrinsics_path)
+ if self.square:
+ intrinsics_dict, intrinsics_matrix = self.update_intrinsics_for_square_image(self.input_resolution,
+ intrinsics_dict,
+ intrinsics_matrix)
+ return intrinsics_dict, intrinsics_matrix
+
+ def get_extrinsics(self, extrinsics_path: str) -> Tuple[np.ndarray, dict]:
+ """Load and process camera extrinsics from JSON file.
+
+ Args:
+ extrinsics_path: Path to the extrinsics JSON file
+
+ Returns:
+ Tuple of (transformation_matrix, extrinsics_dict)
+
+ Raises:
+ FileNotFoundError: If extrinsics file doesn't exist
+ json.JSONDecodeError: If extrinsics file is invalid JSON
+ ValueError: If extrinsics data is invalid
+ """
+ if not os.path.exists(extrinsics_path):
+ raise FileNotFoundError(f"Camera extrinsics file not found: {extrinsics_path}")
+
+ try:
+ with open(extrinsics_path, "r") as f:
+ camera_extrinsics = json.load(f)
+ except json.JSONDecodeError as e:
+ raise ValueError(f"Invalid JSON in extrinsics file {extrinsics_path}: {str(e)}")
+
+ try:
+ T_cam2robot = get_transformation_matrix_from_extrinsics(camera_extrinsics)
+ except Exception as e:
+ raise ValueError(f"Failed to process extrinsics data from {extrinsics_path}: {str(e)}")
+
+ return T_cam2robot, camera_extrinsics
+
+ @staticmethod
+ def update_intrinsics_for_square_image(img_h: int, intrinsics_dict: dict,
+ intrinsics_matrix: np.ndarray) -> Tuple[dict, np.ndarray]:
+ """
+ Adjusts camera intrinsic parameters for a square image by modifying the principal point offset.
+
+ Args:
+ img_h (int): Height of the image (assumed to be square).
+ intrinsics_dict (dict): Dictionary of intrinsic parameters.
+ intrinsics_matrix (np.ndarray): Intrinsic matrix.
+
+ Returns:
+ Tuple[dict, np.ndarray]: Updated intrinsic parameters and matrix.
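+
+        Example (illustrative): for a 1080-pixel-high 16:9 frame the original
+        width is 1080 * 16 // 9 = 1920, so offset = (1920 - 1080) // 2 = 420 and
+        cx is reduced by 420 pixels to match the center crop to 1080x1080.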
+ """
+ img_w = img_h * 16 // 9
+ offset = (img_w - img_h) // 2
+ intrinsics_dict["cx"] -= offset
+ intrinsics_matrix[0, 2] -= offset
+ return intrinsics_dict, intrinsics_matrix
diff --git a/phantom/phantom/processors/bbox_processor.py b/phantom/phantom/processors/bbox_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e83b54b913ccc762691aad77a3843ff135157d7c
--- /dev/null
+++ b/phantom/phantom/processors/bbox_processor.py
@@ -0,0 +1,851 @@
+"""
+Bounding Box Processor Module
+
+This module provides video processing capabilities for detecting and tracking hand bounding boxes
+in demonstration videos. It serves as the first stage in the hand processing pipeline, providing
+spatial localization data for downstream pose estimation and segmentation tasks.
+
+Key Features:
+- Multiple hand detection methods (DINO, EPIC-KITCHENS integration)
+- Bimanual hand tracking with left/right classification
+- Temporal consistency through outlier filtering and interpolation
+- Spatial constraint validation (edge detection, center positioning)
+- Visualization and annotation generation
+
+Processing Pipeline:
+1. Video loading and validation
+2. Frame-by-frame hand detection using configured detectors
+3. Bounding box classification (left/right) based on spatial positioning
+4. Temporal filtering to remove outliers and large jumps
+5. Gap interpolation for smooth trajectories
+6. Edge distance calculation for quality assessment
+7. Result visualization and storage
+
+The processor supports multiple detection backends:
+- DINO-based detection for general hand detection
+- EPIC-KITCHENS pre-computed detections
+- Configurable confidence thresholds and spatial constraints
+
+Output Data:
+- Hand detection flags per frame (boolean arrays)
+- Bounding box coordinates [x1, y1, x2, y2] per frame
+- Bounding box centers [x, y] per frame
+- Distance metrics to image edges
+- Annotated visualization videos
+"""
+
+import os
+import pickle
+import logging
+import numpy as np
+import mediapy as media
+import cv2
+import itertools
+import time
+import matplotlib.pyplot as plt
+from typing import List, Tuple, Optional, Any, Dict
+from typing_extensions import Literal
+import numpy.typing as npt
+from omegaconf import DictConfig
+
+from phantom.processors.base_processor import BaseProcessor
+from phantom.processors.paths import Paths
+from phantom.processors.phantom_data import hand_side_dict
+
+from phantom.utils.bbox_utils import get_bbox_center, get_bbox_center_min_dist_to_edge
+
+logger = logging.getLogger(__name__)
+
+# Type aliases for better readability
+DetectionResults = Dict[str, npt.NDArray]
+BBoxArray = npt.NDArray[np.float32] # [x1, y1, x2, y2]
+CenterArray = npt.NDArray[np.float32] # [x, y]
+DetectionFlagArray = npt.NDArray[np.bool_]
+HandSide = Literal["left", "right"]
+
+class BBoxProcessor(BaseProcessor):
+    """
+    Bounding box detection and tracking processor for hand localization in videos.
+
+    This processor serves as the foundation of the hand processing pipeline by detecting
+    and tracking hand bounding boxes across video frames. It handles both single-arm
+    and bimanual setups.
+
+    The processor employs multiple strategies for reliable detection:
+    - Primary detection using DINO or pre-computed EPIC data
+    - Spatial reasoning for left/right hand classification
+    - Temporal filtering to maintain trajectory consistency
+    - Gap interpolation for handling missing detections
+    - Quality assessment through edge distance metrics
+
+    Attributes:
+        H (int): Video frame height (set during processing)
+        W (int): Video frame width (set during processing)
+        center (int): Horizontal center of the frame for left/right classification
+        margin (int): Pixel margin for hand side classification tolerance
+        confidence_threshold (float): Minimum confidence for valid detections
+        dino_detector: DINO-based hand detector (if not using EPIC data)
+        filtered_hand_detection_data (dict): Processed EPIC detection data
+        sorted_keys (list): Sorted frame indices for EPIC data processing
+    """
+
+    # Detection configuration constants
+    HAND_SIDE_MARGIN = 50  # Pixel margin for hand side classification tolerance
+    OVERLAP_THRESHOLD = 0.3  # Threshold for considering bboxes as overlapping
+    MAX_INTERPOLATION_GAP = 10  # Maximum frames to interpolate over
+    MAX_SPATIAL_JUMP = 200.0  # Maximum allowed pixel jump between detections
+    MAX_JUMP_LOOKAHEAD = 10  # Maximum consecutive distant points to filter
+    DINO_CONFIDENCE_THRESH = 0.2  # Default confidence threshold
+
+    # Visualization constants
+    LEFT_HAND_COLOR = (0, 0, 255)  # BGR format - Red for left hand
+    RIGHT_HAND_COLOR = (0, 255, 0)  # BGR format - Green for right hand
+    BBOX_THICKNESS = 2  # Thickness of bounding box lines
+
+ def __init__(self, cfg: DictConfig) -> None:
+ """
+ Initialize the bounding box processor with configuration parameters.
+
+ Args:
+ cfg: Hydra configuration object containing processing configuration
+ including confidence thresholds, target hands, and dataset type
+ """
+ super().__init__(cfg)
+ # Image dimensions (set when processing video)
+ self.H: int = 0
+ self.W: int = 0
+
+        # Initialize detection backend based on dataset type. DetectorDino is
+        # imported lazily, so the attribute is left unannotated here to avoid
+        # referencing the class when EPIC pre-computed detections are used.
+        self.dino_detector = None
+        if not self.epic:
+            from phantom.detectors.detector_dino import DetectorDino
+            self.dino_detector = DetectorDino("IDEA-Research/grounding-dino-base")
+
+ # EPIC-specific attributes
+ self.filtered_hand_detection_data: Dict[str, List[Any]] = {}
+ self.sorted_keys: List[str] = []
+
+ # ============================================================================
+ # COMMON/SHARED METHODS (Used by both Phantom and EPIC modes)
+ # ============================================================================
+
+ def process_one_demo(self, data_sub_folder: str) -> None:
+ """
+ Process a single demonstration video to extract hand bounding boxes.
+
+ Args:
+ data_sub_folder: Path to the demonstration data folder containing the video
+ and any pre-computed hand detection data.
+
+ The method performs the following steps:
+ 1. Loads and validates input video and detection data
+ 2. Processes each frame to detect and classify hand positions
+ 3. Applies post-processing filters for temporal consistency
+ 4. Generates quality metrics and visualizations
+ 5. Saves all results in standardized format
+
+ Raises:
+ FileNotFoundError: If required input files (video, detection data) are not found
+ ValueError: If video frames or hand detection data are invalid
+ """
+ # Setup and validation
+ save_folder = self.get_save_folder(data_sub_folder)
+
+ paths = self.get_paths(save_folder)
+
+ # Load and validate input data
+ imgs_rgb = self._load_video(paths)
+
+ # Process frames based on dataset type
+ if self.epic:
+ self._load_epic_hand_data(paths)
+ detection_results = self._process_epic_frames(imgs_rgb)
+ else:
+ detection_results = self._process_frames(imgs_rgb)
+
+ # Post-process results for temporal consistency
+ processed_results = self._post_process_detections(detection_results)
+
+ # Generate visualization for quality assessment
+ visualization_results = self._generate_visualization(imgs_rgb, processed_results)
+
+ # Save all results to disk
+ self._save_results(paths, processed_results, visualization_results)
+
+
+ def _load_video(self, paths: Paths) -> np.ndarray:
+ """
+ Load and validate video data from the specified path.
+
+ Args:
+ paths: Paths object containing video file locations
+
+ Returns:
+ RGB video frames as array
+
+ Raises:
+ FileNotFoundError: If video file doesn't exist
+ ValueError: If video is empty or corrupted
+ """
+ if not os.path.exists(paths.video_left):
+ raise FileNotFoundError(f"Video file not found: {paths.video_left}")
+
+        imgs_rgb = media.read_video(paths.video_left)
+ if len(imgs_rgb) == 0:
+ raise ValueError("Empty video file")
+
+ # Store video dimensions for coordinate calculations
+ self.H, self.W, _ = imgs_rgb[0].shape
+ self.center: int = self.W // 2 # Center line for left/right classification
+ return imgs_rgb
+
+ # ============================================================================
+ # PHANTOM-SPECIFIC METHODS (DINO Detection)
+ # ============================================================================
+ def _process_frames(self, imgs_rgb: np.ndarray) -> Dict[str, np.ndarray]:
+ """
+ Process RGB frames using DINO detector for hand detection and classification.
+
+ This method handles the core detection pipeline for non-EPIC datasets,
+ using DINO for hand detection and implementing spatial reasoning for
+ left/right classification.
+
+ Args:
+ imgs_rgb: Array of RGB images with shape (num_frames, height, width, 3)
+
+ Returns:
+ Dictionary containing:
+ - left/right_hand_detected: Boolean arrays indicating hand detection per frame
+ - left/right_bboxes: Bounding box coordinates [x1,y1,x2,y2] per frame
+ - left/right_bboxes_ctr: Bounding box centers [x,y] per frame
+ """
+ num_frames = len(imgs_rgb)
+
+ detection_arrays = self._initialize_detection_arrays(num_frames)
+
+ for idx in range(num_frames):
+ try:
+ # Run DINO detection on current frame
+ bboxes, scores = self.dino_detector.get_bboxes(imgs_rgb[idx], "a hand", threshold=self.DINO_CONFIDENCE_THRESH, visualize=False)
+ if len(bboxes) == 0:
+ continue
+
+ bboxes = np.array(bboxes)
+ scores = np.array(scores)
+
+ # Process detections for current frame
+ self._process_frame_detections(idx, bboxes, scores, detection_arrays)
+ except Exception as e:
+ logger.warning(f"Frame {idx} processing failed: {str(e)}")
+ continue
+
+ return {
+ 'left_hand_detected': detection_arrays['left_hand_detected'],
+ 'right_hand_detected': detection_arrays['right_hand_detected'],
+ 'left_bboxes': detection_arrays['left_bboxes'],
+ 'right_bboxes': detection_arrays['right_bboxes'],
+ 'left_bboxes_ctr': detection_arrays['left_bboxes_ctr'],
+ 'right_bboxes_ctr': detection_arrays['right_bboxes_ctr'],
+ }
+
+ def _initialize_detection_arrays(self, num_frames: int) -> Dict[str, npt.NDArray]:
+ """
+ Initialize arrays for storing detection results.
+
+ Args:
+ num_frames: Number of frames in the video
+
+ Returns:
+ Dictionary containing pre-allocated arrays for left/right hand detections,
+ bounding boxes, centers, and detection flags
+ """
+ return {
+ 'left_bboxes': np.zeros((num_frames, 4)),
+ 'right_bboxes': np.zeros((num_frames, 4)),
+ 'left_bboxes_ctr': np.zeros((num_frames, 2)),
+ 'right_bboxes_ctr': np.zeros((num_frames, 2)),
+ 'left_hand_detected': np.zeros(num_frames, dtype=bool),
+ 'right_hand_detected': np.zeros(num_frames, dtype=bool)
+ }
+
+ def _process_frame_detections(self, idx: int, bboxes: npt.NDArray, scores: npt.NDArray,
+ detection_arrays: Dict[str, npt.NDArray]) -> None:
+ """
+ Process detections for a single frame.
+
+ Args:
+ idx: Frame index
+ bboxes: Array of detected bounding boxes
+ scores: Array of detection confidence scores
+ detection_arrays: Dictionary to store detection results
+ """
+ if len(bboxes) == 0:
+ return
+
+ # Always select the bounding box with the highest score
+ best_idx = np.argmax(scores)
+ best_bbox = bboxes[best_idx]
+ best_bbox_ctr = get_bbox_center(best_bbox)
+
+ # Assign hand type directly based on self.target_hand
+ if self.target_hand == "left":
+ detection_arrays['left_bboxes'][idx] = best_bbox
+ detection_arrays['left_bboxes_ctr'][idx] = best_bbox_ctr
+ detection_arrays['left_hand_detected'][idx] = True
+ elif self.target_hand == "right":
+ detection_arrays['right_bboxes'][idx] = best_bbox
+ detection_arrays['right_bboxes_ctr'][idx] = best_bbox_ctr
+ detection_arrays['right_hand_detected'][idx] = True
+
+
+ # ============================================================================
+ # EPIC-SPECIFIC METHODS (EPIC Dataset Processing)
+ # ============================================================================
+
+ def _validate_epic_data_structure(self, epic_data: List[Any]) -> bool:
+ """Validate EPIC data structure before processing."""
+ if not epic_data:
+ return False
+
+ # Check if first item has required attributes
+ try:
+ first_item = epic_data[0]
+ if not hasattr(first_item, 'side') or not hasattr(first_item, 'bbox'):
+ logging.warning("EPIC data missing required attributes: 'side' or 'bbox'")
+ return False
+
+ # Check if bbox has required attributes
+ bbox = first_item.bbox
+ required_attrs = ['left', 'right', 'top', 'bottom']
+ if not all(hasattr(bbox, attr) for attr in required_attrs):
+ logging.warning("EPIC bbox missing required attributes: left, right, top, bottom")
+ return False
+
+ return True
+ except Exception as e:
+ logging.warning(f"Error validating EPIC data structure: {str(e)}")
+ return False
+
+ def _load_epic_hand_data(self, paths: Paths) -> Dict[str, Any]:
+ """
+ Load and validate pre-computed hand detection data from EPIC-KITCHENS dataset.
+
+ EPIC-KITCHENS provides pre-computed hand detection annotations that we can
+ use directly instead of running our own detection. This method filters and
+ sorts the data for efficient frame-by-frame processing.
+
+ Args:
+ paths: Paths object containing detection data file location
+
+ Returns:
+ Dictionary of filtered and sorted hand detection data
+
+ Raises:
+ FileNotFoundError: If detection data file doesn't exist
+ """
+ if not os.path.exists(paths.hand_detection_data):
+ raise FileNotFoundError(f"Hand detection data not found: {paths.hand_detection_data}")
+
+ with open(paths.hand_detection_data, 'rb') as f:
+ hand_detection_data = dict(pickle.load(f))
+
+ # Filter out detection objects without valid side information
+ filtered_data = {
+ key: [obj for obj in obj_list if hasattr(obj, 'side')]
+ for key, obj_list in hand_detection_data.items()
+ }
+
+ # Sort by frame index for sequential processing
+ self.filtered_hand_detection_data = dict(sorted(filtered_data.items(), key=lambda x: int(x[0])))
+ self.sorted_keys = sorted(self.filtered_hand_detection_data.keys(), key=lambda k: int(k))
+
+ return self.filtered_hand_detection_data
+
+ def _process_epic_frames(self, imgs_rgb: npt.NDArray[np.uint8]) -> DetectionResults:
+ """
+ Process frames using pre-computed EPIC-KITCHENS hand detection data.
+
+ This method processes EPIC-KITCHENS dataset videos using their provided
+ hand detection annotations, converting them to our standard format while
+ applying spatial validation constraints.
+
+ Args:
+ imgs_rgb: Array of RGB images for dimension reference
+
+ Returns:
+ Dictionary containing detection results in the same format as _process_frames
+ """
+ num_frames = len(imgs_rgb)
+
+ detection_arrays = self._initialize_detection_arrays(num_frames)
+
+ # Process each frame using EPIC detection data
+ for idx in range(num_frames):
+ try:
+ epic_data = self.filtered_hand_detection_data[self.sorted_keys[idx]]
+
+ if len(epic_data) == 0:
+ continue
+
+ # Process frame detections
+ self._process_epic_frame_detections(idx, epic_data, detection_arrays)
+ except KeyError:
+ logger.warning(f"Missing EPIC data for frame {idx}")
+ continue
+ except Exception as e:
+ logger.warning(f"EPIC frame {idx} processing failed: {str(e)}")
+ continue
+
+ return {
+ 'left_hand_detected': detection_arrays['left_hand_detected'],
+ 'right_hand_detected': detection_arrays['right_hand_detected'],
+ 'left_bboxes': detection_arrays['left_bboxes'],
+ 'right_bboxes': detection_arrays['right_bboxes'],
+ 'left_bboxes_ctr': detection_arrays['left_bboxes_ctr'],
+ 'right_bboxes_ctr': detection_arrays['right_bboxes_ctr']
+ }
+
+ def _process_epic_frame_detections(self, idx: int, epic_data: List[Any],
+ detection_arrays: Dict[str, npt.NDArray]) -> None:
+ """Process EPIC detections for a single frame."""
+ # Process left and right hands separately
+ left_detected, left_bbox, left_bbox_ctr = self._process_epic_hand_detection(epic_data, "left")
+ right_detected, right_bbox, right_bbox_ctr = self._process_epic_hand_detection(epic_data, "right")
+
+ # Store results in pre-allocated arrays
+ detection_arrays['left_hand_detected'][idx] = left_detected
+ detection_arrays['right_hand_detected'][idx] = right_detected
+ if left_detected:
+ detection_arrays['left_bboxes'][idx] = left_bbox
+ detection_arrays['left_bboxes_ctr'][idx] = left_bbox_ctr
+ if right_detected:
+ detection_arrays['right_bboxes'][idx] = right_bbox
+ detection_arrays['right_bboxes_ctr'][idx] = right_bbox_ctr
+
+ # Quality check: If hands appear crossed (left hand on right side),
+ # mark both as invalid to avoid confusion
+ if left_detected and right_detected:
+ self._validate_hand_positions(idx, left_bbox_ctr, right_bbox_ctr, detection_arrays)
+
+ def _validate_hand_positions(self, idx: int, left_bbox_ctr: npt.NDArray, right_bbox_ctr: npt.NDArray,
+ detection_arrays: Dict[str, npt.NDArray]) -> None:
+ """Validate that hands are on correct sides of the image."""
+ if left_bbox_ctr[0] > right_bbox_ctr[0]:
+ # Left hand appears to be on the right side - mark both as invalid
+ detection_arrays['left_hand_detected'][idx] = False
+ detection_arrays['right_hand_detected'][idx] = False
+
+ def _process_epic_hand_detection(self,
+ epic_data: List[Any],
+ hand_side: HandSide) -> Tuple[bool, BBoxArray, CenterArray]:
+ """
+ Process EPIC hand detection data for a single frame and hand side.
+
+ This method extracts and validates hand detection data from EPIC annotations,
+ converting normalized coordinates to pixel coordinates and applying spatial
+ validation constraints.
+
+ Args:
+ epic_data: List of detection objects for the current frame
+ hand_side: Either "left" or "right" specifying which hand to process
+
+ Returns:
+ Tuple of (is_detected: bool, bbox: ndarray, bbox_center: ndarray)
+ """
+ if hand_side not in hand_side_dict:
+ raise ValueError(f"Invalid hand side: {hand_side}")
+
+ # Default empty result for failed detections
+ empty_result = (False, np.array([0, 0, 0, 0]), np.array([0, 0]))
+
+ try:
+ # Filter and validate detection data
+ hand_data = self._filter_epic_hand_data(epic_data, hand_side)
+ if not hand_data:
+ return empty_result
+
+ # Validate data structure
+ if not self._validate_epic_data_structure(hand_data):
+ return empty_result
+
+ # Extract and process bounding box
+ bbox, bbox_center = self._extract_epic_bbox(hand_data[0])
+
+ # Validate bounding box coordinates
+ if not self._validate_bbox_coordinates(hand_data[0].bbox, hand_side):
+ return empty_result
+
+ # Apply spatial validation
+ is_valid = self._validate_spatial_position(bbox_center, hand_side)
+ return (is_valid, bbox, bbox_center) if is_valid else empty_result
+
+ except Exception as e:
+ logging.warning(f"Unexpected error processing {hand_side} hand detection: {str(e)}")
+ return empty_result
+
+ def _filter_epic_hand_data(self, epic_data: List[Any], hand_side: HandSide) -> List[Any]:
+ """Filter EPIC detection data for the specified hand side."""
+ return [data for data in epic_data if data.side.value == hand_side_dict[hand_side]]
+
+ def _extract_epic_bbox(self, hand_data: Any) -> Tuple[BBoxArray, CenterArray]:
+ """Extract bounding box and center from EPIC hand detection data."""
+ bbox_cls = hand_data.bbox
+
+ # Convert normalized coordinates to pixel coordinates
+ bbox = np.array([
+ bbox_cls.left * self.W,
+ bbox_cls.top * self.H,
+ bbox_cls.right * self.W,
+ bbox_cls.bottom * self.H
+ ])
+
+ # Calculate center point for spatial validation
+ bbox_center = np.array([
+ (bbox[0] + bbox[2]) / 2,
+ (bbox[1] + bbox[3]) / 2
+ ]).astype(np.int32)
+
+ return bbox, bbox_center
+
+ def _validate_spatial_position(self, bbox_center: CenterArray, hand_side: HandSide) -> bool:
+ """Validate that hand center is on correct side of image."""
+ if hand_side == "left":
+ return bbox_center[0] <= (self.center + self.HAND_SIDE_MARGIN)
+ else: # right
+ return bbox_center[0] >= (self.center - self.HAND_SIDE_MARGIN)
+
+ def _validate_bbox_coordinates(self, bbox_cls: Any, hand_side: HandSide) -> bool:
+ """Validate bounding box coordinates are within valid range [0,1]."""
+ if not (0 <= bbox_cls.left <= 1 and 0 <= bbox_cls.right <= 1 and
+ 0 <= bbox_cls.top <= 1 and 0 <= bbox_cls.bottom <= 1):
+ logging.warning(f"Invalid bbox coordinates detected for {hand_side} hand: "
+ f"left={bbox_cls.left:.3f}, right={bbox_cls.right:.3f}, "
+ f"top={bbox_cls.top:.3f}, bottom={bbox_cls.bottom:.3f}")
+ return False
+ return True
+
+
+ # ============================================================================
+ # UTILITY/HELPER METHODS (General utilities and post-processing)
+ # ============================================================================
+
+
+ def _post_process_detections(self, detection_results: DetectionResults) -> DetectionResults:
+ """
+ Apply post-processing to improve detection temporal consistency.
+
+ This method applies several filters and enhancements to the raw detection
+ results to improve their quality and temporal coherence:
+ 1. Filter out large spatial jumps that indicate tracking errors
+ 2. Interpolate short gaps in detection sequences
+ 3. Calculate quality metrics (distance to image edges)
+
+ Args:
+ detection_results: Raw detection results from frame processing
+
+ Returns:
+ Enhanced detection results with improved temporal consistency
+ """
+ # Filter out large jumps for both hands
+ left_results = self._filter_large_jumps(
+ detection_results['left_hand_detected'],
+ detection_results['left_bboxes'],
+ detection_results['left_bboxes_ctr'],
+ max_jump=self.MAX_SPATIAL_JUMP,
+ lookahead=self.MAX_JUMP_LOOKAHEAD
+ )
+ right_results = self._filter_large_jumps(
+ detection_results['right_hand_detected'],
+ detection_results['right_bboxes'],
+ detection_results['right_bboxes_ctr'],
+ max_jump=self.MAX_SPATIAL_JUMP,
+ lookahead=self.MAX_JUMP_LOOKAHEAD
+ )
+
+ # Interpolate missing detections for smooth trajectories
+ left_results = self._interpolate_detections(*left_results, max_gap=self.MAX_INTERPOLATION_GAP)
+ right_results = self._interpolate_detections(*right_results, max_gap=self.MAX_INTERPOLATION_GAP)
+
+ # Calculate quality metrics: minimum distance from bbox center to image edges
+ left_bbox_min_dist = get_bbox_center_min_dist_to_edge(left_results[1], self.W, self.H)
+ right_bbox_min_dist = get_bbox_center_min_dist_to_edge(right_results[1], self.W, self.H)
+
+ return {
+ 'left_hand_detected': left_results[0],
+ 'right_hand_detected': right_results[0],
+ 'left_bboxes': left_results[1],
+ 'right_bboxes': right_results[1],
+ 'left_bboxes_ctr': left_results[2],
+ 'right_bboxes_ctr': right_results[2],
+ 'left_bbox_min_dist_to_edge': left_bbox_min_dist,
+ 'right_bbox_min_dist_to_edge': right_bbox_min_dist
+ }
+
+ def _generate_visualization(self, imgs_rgb: np.ndarray, results: Dict[str, np.ndarray]) -> List[np.ndarray]:
+ """
+ Generate visualization of detection results for quality assessment.
+
+ Creates annotated frames showing detected bounding boxes for visual
+ inspection of detection quality and temporal consistency.
+
+ Args:
+ imgs_rgb: Original RGB video frames
+ results: Processed detection results
+
+ Returns:
+ List of annotated images with bounding boxes drawn
+ """
+ list_img_annot = []
+ for idx in range(len(imgs_rgb)):
+ left_bbox = None
+ right_bbox = None
+
+ # Prepare bounding boxes for visualization
+ if results['left_hand_detected'][idx] or results['right_hand_detected'][idx]:
+ left_bbox = results['left_bboxes'][idx] if results['left_hand_detected'][idx] else None
+ right_bbox = results['right_bboxes'][idx] if results['right_hand_detected'][idx] else None
+
+ # Generate annotated image
+ img_annot = self.visualize_detections(imgs_rgb[idx], left_bbox, right_bbox, show_image=False)
+ list_img_annot.append(img_annot)
+ return list_img_annot
+
+ def _save_results(self, paths: Paths, results: DetectionResults, visualization_results: List[npt.NDArray[np.uint8]]) -> None:
+ """
+ Save all processed results to disk in standardized format.
+
+ Args:
+ paths: Paths object containing output file locations
+ results: Processed detection results
+ visualization_results: Generated visualization frames
+ """
+ # Create output directory if it doesn't exist
+        os.makedirs(paths.bbox_processor, exist_ok=True)
+
+ # Save detection data in compressed NumPy format
+ np.savez(paths.bbox_data, **results)
+
+ # Save visualization video with lossless compression
+ media.write_video(paths.video_bboxes, visualization_results, fps=15, codec="ffv1")
+
+ def _interpolate_detections(self, detected: DetectionFlagArray,
+ bboxes: BBoxArray,
+ centers: CenterArray,
+ max_gap: int = 10) -> Tuple[DetectionFlagArray, BBoxArray, CenterArray]:
+ """
+ Interpolate bounding boxes and detection status for short gaps in tracking.
+
+ This method fills in missing detections using linear interpolation when the
+ gap is small enough to reasonably assume continuous hand motion. This helps
+ create smoother trajectories for downstream processing.
+
+ Args:
+ detected: Boolean array of detection status per frame
+ bboxes: Array of bounding boxes [N, 4] format [x1, y1, x2, y2]
+ centers: Array of bbox centers [N, 2] format [x, y]
+ max_gap: Maximum gap size (in frames) to interpolate over
+
+ Returns:
+ Tuple of (interpolated detection status, interpolated bboxes, interpolated centers)
+ """
+ detected = detected.copy()
+ bboxes = bboxes.copy()
+ centers = centers.copy()
+
+ # Handle single-frame gaps first (most common case)
+ for i in range(1, len(detected) - 1):
+ if not detected[i] and detected[i-1] and detected[i+1]:
+ # Get valid bboxes/centers before and after gap
+ start_bbox = bboxes[i-1]
+ end_bbox = bboxes[i+1]
+ start_center = centers[i-1]
+ end_center = centers[i+1]
+
+ # Linear interpolation with t = 0.5 for single frame
+ interpolated_bbox = 0.5 * (start_bbox + end_bbox)
+ interpolated_center = 0.5 * (start_center + end_center)
+
+ # Validate interpolated values are reasonable
+ if self._is_valid_bbox(interpolated_bbox) and self._is_valid_center(interpolated_center):
+ bboxes[i] = interpolated_bbox
+ centers[i] = interpolated_center
+ detected[i] = True
+
+ # Handle multi-frame gaps
+ non_detect_start = None
+ for i in range(1, len(detected) - 1):
+ # Start of non-detection sequence
+ if detected[i-1] and not detected[i]:
+ non_detect_start = i
+ # End of non-detection sequence
+ elif non_detect_start is not None and not detected[i] and detected[i+1]:
+ non_detect_end = i
+ gap_size = non_detect_end - non_detect_start + 1
+
+ # Only interpolate if gap is small enough and has valid detections on both sides
+ if gap_size <= max_gap:
+ # Get valid bboxes/centers before and after gap
+ start_bbox = bboxes[non_detect_start - 1]
+ end_bbox = bboxes[non_detect_end + 1]
+ start_center = centers[non_detect_start - 1]
+ end_center = centers[non_detect_end + 1]
+
+ # Generate interpolation steps
+ steps = gap_size + 1
+ for j in range(gap_size):
+ t = (j + 1) / steps # Interpolation factor
+
+ # Linear interpolation of bbox coordinates
+ bboxes[non_detect_start + j] = (1 - t) * start_bbox + t * end_bbox
+
+ # Linear interpolation of center coordinates
+ centers[non_detect_start + j] = (1 - t) * start_center + t * end_center
+
+ # Mark as detected
+ detected[non_detect_start + j] = True
+
+ non_detect_start = None
+
+ return detected, bboxes, centers
+
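+    # Worked example (illustrative): with detections at frames 4 and 8 and a gap
+    # at frames 5-7, gap_size = 3 <= max_gap, so the three missing boxes are
+    # linearly blended with t = 1/4, 2/4, 3/4 between the frame-4 and frame-8
+    # boxes and then marked as detected.
+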
+ def _is_valid_bbox(self, bbox: BBoxArray) -> bool:
+ """Validate that bbox coordinates are reasonable."""
+ if bbox is None or len(bbox) != 4:
+ return False
+ # Check for reasonable bounds (not negative, not too large)
+ return (bbox >= 0).all() and (bbox[:2] < bbox[2:]).all() and bbox.max() < max(self.W, self.H) * 2
+
+ def _is_valid_center(self, center: CenterArray) -> bool:
+ """Validate that center coordinates are reasonable."""
+ if center is None or len(center) != 2:
+ return False
+ # Check for reasonable bounds
+ return (center >= 0).all() and center[0] < self.W * 2 and center[1] < self.H * 2
+
+ def visualize_detections(self, img: npt.NDArray[np.uint8],
+ left_bbox: Optional[npt.NDArray[np.float32]] = None,
+ right_bbox: Optional[npt.NDArray[np.float32]] = None,
+ show_image: bool = True) -> npt.NDArray[np.uint8]:
+ """
+ Visualize hand detections by drawing bounding boxes on the image.
+
+ This method creates annotated images showing detected hand locations with
+ color-coded bounding boxes (red for left hand, green for right hand).
+
+ Args:
+ img: Input RGB image to annotate
+ left_bbox: Left hand bounding box [x1, y1, x2, y2] or None if not detected
+ right_bbox: Right hand bounding box [x1, y1, x2, y2] or None if not detected
+ show_image: Whether to display the image using cv2.imshow
+
+ Returns:
+ The annotated image
+ """
+        # Draw directly on the input image; cv2.rectangle modifies it in place
+        img_bgr = img
+
+ # Draw left hand bounding box in red
+ if left_bbox is not None and not np.array_equal(left_bbox, np.array([0, 0, 0, 0])):
+ cv2.rectangle(
+ img_bgr,
+ (int(left_bbox[0]), int(left_bbox[1])),
+ (int(left_bbox[2]), int(left_bbox[3])),
+ self.LEFT_HAND_COLOR,
+ self.BBOX_THICKNESS
+ )
+
+ # Draw right hand bounding box in green
+ if right_bbox is not None and not np.array_equal(right_bbox, np.array([0, 0, 0, 0])):
+ cv2.rectangle(
+ img_bgr,
+ (int(right_bbox[0]), int(right_bbox[1])),
+ (int(right_bbox[2]), int(right_bbox[3])),
+ self.RIGHT_HAND_COLOR,
+ self.BBOX_THICKNESS
+ )
+
+ # Optionally display the image for debugging
+ if show_image:
+ cv2.imshow("Hand Detections", img_bgr)
+ cv2.waitKey(0)
+ cv2.destroyAllWindows()
+
+ return img_bgr
+
+ @staticmethod
+ def _filter_large_jumps(detected: DetectionFlagArray,
+ bboxes: BBoxArray,
+ centers: CenterArray,
+ max_jump: float = 200.0,
+ lookahead: int = 10) -> Tuple[DetectionFlagArray, BBoxArray, CenterArray]:
+ """
+ Filter out small groups of detections that are spatially inconsistent with the trajectory.
+
+ This method identifies and removes isolated detections that are far from the
+ expected trajectory, which usually indicate false positives or tracking errors.
+ It helps maintain temporal consistency in hand tracking.
+
+ Args:
+ detected: Boolean array of detection status per frame
+ bboxes: Array of bounding boxes [N, 4] format [x1, y1, x2, y2]
+ centers: Array of bbox centers [N, 2] format [x, y]
+ max_jump: Maximum allowed distance (in pixels) between consecutive detections
+ lookahead: Maximum number of consecutive distant points to filter as a group
+
+ Returns:
+ Tuple of (filtered detection status, filtered bboxes, filtered centers)
+ """
+ detected = detected.copy()
+ bboxes = bboxes.copy()
+ centers = centers.copy()
+
+ # Templates for clearing invalid detections
+ empty_bbox = np.array([0, 0, 0, 0])
+ empty_center = np.array([0, 0])
+
+ i = 0
+ while i < len(detected):
+            # Compare the current frame's center against the immediately following frame
+            next_valid = i + 1
+
+ if next_valid >= len(detected):
+ break
+
+ # Calculate spatial distance to next detection
+ dist = np.linalg.norm(centers[next_valid] - centers[i])
+
+ if dist > max_jump:
+ # Large jump detected - check if it's part of a small group of outliers
+ distant_points = []
+ ref_center = centers[i] # Use current point as reference
+
+ # Look ahead to find consecutive distant points
+ for j in range(next_valid, len(detected)):
+ curr_dist = np.linalg.norm(centers[j] - ref_center)
+ if curr_dist > max_jump:
+ distant_points.append(j)
+ else:
+ break
+
+ # If we found a small group of distant points, filter them out
+ if len(distant_points) > 0 and len(distant_points) <= lookahead:
+ for idx in distant_points:
+ detected[idx] = False
+ bboxes[idx] = empty_bbox
+ centers[idx] = empty_center
+ logging.warning(f"Filtered out frame {idx} as part of small distant group")
+
+ i = next_valid
+
+ return detected, bboxes, centers
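+
+    # Worked example (illustrative): if the tracked center sits near (100, 100),
+    # jumps by more than MAX_SPATIAL_JUMP pixels for three consecutive frames,
+    # and then returns, that group of at most MAX_JUMP_LOOKAHEAD outlier frames
+    # is cleared (marked undetected with zeroed boxes and centers).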
diff --git a/phantom/phantom/processors/hand_processor.py b/phantom/phantom/processors/hand_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d18a549233d6682a698c4968e43911a1299523a
--- /dev/null
+++ b/phantom/phantom/processors/hand_processor.py
@@ -0,0 +1,675 @@
+"""
+Hand Processor Module
+
+This module converts detected hand bounding boxes into detailed 3D hand poses using
+state-of-the-art pose estimation models, with optional depth-based refinement for improved accuracy.
+
+Processing Pipeline:
+1. Load video frames and bounding box data from previous stage
+2. Apply HaMeR pose estimation within detected bounding boxes
+3. Filter poses based on edge proximity and quality metrics
+4. Optionally refine 3D poses using depth data and segmentation
+5. Generate hand mesh models and extract keypoint trajectories
+6. Save processed hand sequences for downstream tasks
+
+The module supports multiple processing modes:
+- Hand2DProcessor: 2D pose estimation only (faster, camera-based)
+- Hand3DProcessor: Full 3D processing with depth alignment (more accurate, if depth is available)
+
+Output Data:
+- HandSequence objects containing pose trajectories
+- 2D keypoint positions in image coordinates
+- 3D keypoint positions in camera coordinates
+- Hand detection flags per frame
+- Annotated visualization videos
+"""
+
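+# Illustrative sketch (not executed here): a processed sequence can later be
+# reloaded roughly as
+#   seq = HandSequence.load(paths.hand_data_left)
+#   seq.kpts_3d        # (N, 21, 3) keypoints in the camera frame
+#   seq.hand_detected  # (N,) per-frame detection flags
+# as done by phantom.processors.action_processor.ActionProcessor._load_sequences.
+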
+import glob
+import os
+import logging
+from tqdm import tqdm
+import numpy as np
+import mediapy as media
+import open3d as o3d # type: ignore
+from typing import Tuple, Optional, Dict, Any
+import trimesh
+from collections import defaultdict
+import argparse
+
+from phantom.utils.pcd_utils import get_visible_points, get_pcd_from_points, icp_registration, get_point_cloud_of_segmask, get_3D_points_from_pixels, remove_outliers, get_bbox_of_3d_points, trim_pcd_to_bbox, visualize_pcds
+from phantom.utils.transform_utils import transform_pts
+from phantom.processors.base_processor import BaseProcessor
+from phantom.detectors.detector_hamer import DetectorHamer
+from phantom.processors.phantom_data import HandSequence, HandFrame, hand_side_dict
+from phantom.processors.paths import Paths
+from phantom.processors.segmentation_processor import HandSegmentationProcessor
+
+logger = logging.getLogger(__name__)
+
+class HandBaseProcessor(BaseProcessor):
+ """
+ Base class for hand pose processing using HaMeR detection and optional depth refinement.
+
+ The processor operates on the output of BBoxProcessor, using detected hand bounding boxes
+ to guide pose estimation. It supports both 2D and 3D processing modes, with the 3D mode
+ providing enhanced accuracy through depth sensor integration.
+
+ Processing Workflow:
+ 1. Load video frames and bounding box detection results
+ 2. For each frame with detected hands:
+ - Apply HaMeR pose estimation within bounding box
+ - Validate pose quality (edge proximity, confidence)
+ - Optionally generate hand segmentation masks for depth refinement
+ - Optionally apply depth-based pose refinement
+ 3. Generate temporal hand sequences with smooth trajectories
+ 4. Save processed results and visualization videos
+
+ Attributes:
+ process_hand_masks (bool): Whether to generate hand segmentation masks
+ apply_depth_alignment (bool): Whether to use depth-based pose refinement
+ detector_hamer (DetectorHamer): HaMeR pose estimation model
+ hand_mask_processor: Segmentation processor for hand mask generation
+ H (int): Video frame height
+ W (int): Video frame width
+ imgs_depth (np.ndarray): Depth images for 3D refinement
+ left_masks (np.ndarray): Left hand segmentation masks
+ right_masks (np.ndarray): Right hand segmentation masks
+ """
+ def __init__(self, args: argparse.Namespace) -> None:
+ """
+ Initialize the hand processor with configuration parameters.
+
+ Args:
+ args: Command line arguments containing processing configuration
+ including depth processing flags and model parameters
+ """
+ super().__init__(args)
+ self.process_hand_masks: bool = False
+ self._initialize_detectors()
+ self.hand_mask_processor: Optional[HandSegmentationProcessor] = None
+ self.apply_depth_alignment: bool = False
+
+ def _initialize_detectors(self) -> None:
+ """
+ Initialize all required detection models.
+
+ Sets up the HaMeR detector for hand pose estimation.
+ """
+ self.detector_hamer = DetectorHamer()
+
+ def process_one_demo(self, data_sub_folder: str) -> None:
+ """
+ Process a single demonstration video to extract hand poses and segmentation.
+
+ Args:
+ data_sub_folder: Path to the demonstration data folder containing
+ video files, bounding box data, and optional depth data
+ """
+ save_folder = self.get_save_folder(data_sub_folder)
+
+ paths = self.get_paths(save_folder)
+
+ # Load RGB video frames
+        imgs_rgb = media.read_video(paths.video_left)
+ self.H, self.W, _ = imgs_rgb[0].shape
+
+ # Load depth data if available (for 3D processing)
+ if os.path.exists(paths.depth):
+ self.imgs_depth = np.load(paths.depth)
+ else:
+ self.imgs_depth = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
+
+ # Load hand segmentation masks if available
+ if os.path.exists(paths.masks_hand_left) and os.path.exists(paths.masks_hand_right):
+ self.left_masks = np.load(paths.masks_hand_left)
+ self.right_masks = np.load(paths.masks_hand_right)
+ else:
+ self.left_masks = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
+ self.right_masks = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
+
+ # Load bounding box detection results from previous stage
+ bbox_data = np.load(paths.bbox_data)
+ left_hand_detected = bbox_data["left_hand_detected"]
+ right_hand_detected = bbox_data["right_hand_detected"]
+ left_bboxes = bbox_data["left_bboxes"]
+ right_bboxes = bbox_data["right_bboxes"]
+
+ # Validate data consistency
+ assert len(left_hand_detected) == len(right_hand_detected)
+ assert len(left_hand_detected) == len(imgs_rgb)
+
+ # Process left and right hand sequences
+ left_sequence = self._process_all_frames(imgs_rgb, left_bboxes, left_hand_detected, "left")
+ right_sequence = self._process_all_frames(imgs_rgb, right_bboxes, right_hand_detected, "right")
+
+ # Generate hand segmentation masks if enabled
+ if self.process_hand_masks:
+ self._get_hand_masks(data_sub_folder, left_sequence, right_sequence)
+ self.left_masks = np.load(paths.masks_hand_left)
+ self.right_masks = np.load(paths.masks_hand_right)
+
+ # Apply depth-based pose refinement if enabled
+ if self.apply_depth_alignment:
+ left_sequence = self._process_all_frames_depth_alignment(imgs_rgb, left_hand_detected, "left", left_sequence)
+ right_sequence = self._process_all_frames_depth_alignment(imgs_rgb, right_hand_detected, "right", right_sequence)
+
+ # Save processed sequences and generate visualizations
+ self._save_results(paths, left_sequence, right_sequence)
+
+ def _process_all_frames(self, imgs_rgb: np.ndarray, bboxes: np.ndarray,
+ hand_detections: np.ndarray, hand_side: str) -> HandSequence:
+ """
+ Process all frames in a video sequence to extract hand poses.
+
+ This method iterates through all video frames, applying pose estimation
+ where hands are detected and creating empty frames where they are not.
+ It maintains temporal consistency and provides quality filtering.
+
+ Args:
+ imgs_rgb: RGB video frames, shape (num_frames, height, width, 3)
+ bboxes: Hand bounding boxes per frame, shape (num_frames, 4)
+ hand_detections: Boolean flags indicating valid detections per frame
+ hand_side: "left" or "right" to specify which hand is being processed
+
+ Returns:
+ HandSequence object containing processed pose data for all frames
+ """
+ sequence = HandSequence()
+
+ for img_idx in tqdm(range(len(imgs_rgb)), disable=False, leave=False):
+ if not hand_detections[img_idx]:
+ # Create empty frame for missing detections
+ sequence.add_frame(HandFrame.create_empty_frame(
+ frame_idx=img_idx,
+ img_rgb=imgs_rgb[img_idx],
+ ))
+ continue
+
+ # Process frame with detected hand
+ frame_data = self._process_frame(img_idx, imgs_rgb[img_idx], bboxes[img_idx],
+ hand_side)
+ sequence.add_frame(frame_data)
+
+ return sequence
+
+ def _process_frame(self, img_idx: int, img_rgb: np.ndarray, bbox: np.ndarray,
+ hand_side: str, view: bool = False) -> HandFrame:
+ """
+ Process a single frame to extract hand pose and validate quality.
+
+ This method applies HaMeR pose estimation within the detected bounding box
+ and performs quality checks to ensure the pose is suitable for downstream
+ processing. Poor quality poses (e.g., hands too close to image edges) are
+ rejected to maintain data quality.
+
+ Args:
+ img_idx: Index of the current frame
+ img_rgb: RGB image data for this frame
+ bbox: Hand bounding box coordinates [x1, y1, x2, y2]
+ hand_side: "left" or "right" specifying which hand is being processed
+ view: Whether to display debug visualizations
+
+ Returns:
+ HandFrame object containing pose data or empty frame if quality is poor
+ """
+ try:
+ # Apply HaMeR pose estimation within bounding box
+ processed_data = self._process_image_with_hamer(img_rgb, bbox[None,...], hand_side, img_idx, view=view)
+
+ # Quality check: reject poses where keypoints are too close to image edges
+ if self.are_kpts_too_close_to_margin(processed_data["kpts_2d"], self.W, self.H, margin=5, threshold=0.1):
+ logger.error(f"Error processing frame {img_idx}: Edge hand")
+ return HandFrame.create_empty_frame(
+ frame_idx=img_idx,
+ img_rgb=img_rgb,
+ )
+
+ # Create frame with validated pose data
+ frame_data = HandFrame(
+ frame_idx=img_idx,
+ hand_detected=True,
+ img_rgb=img_rgb,
+ img_hamer=processed_data["img_hamer"],
+ kpts_2d=processed_data["kpts_2d"],
+ kpts_3d=processed_data["kpts_3d"],
+ )
+
+ return frame_data
+
+ except Exception as e:
+ logger.error(f"Error processing frame {img_idx}: {str(e)}")
+ return HandFrame.create_empty_frame(
+ frame_idx=img_idx,
+ img_rgb=img_rgb,
+ )
+
+ def are_kpts_too_close_to_margin(self, kpts_2d: np.ndarray, img_width: int, img_height: int,
+ margin: int = 20, threshold: float = 0.5) -> bool:
+ """
+        Check whether too many hand keypoints lie near the image edges.
+
+ This quality check rejects hand poses where too many keypoints are near
+ the image boundaries, which typically indicates partial occlusion or
+ tracking errors that would lead to poor pose estimates.
+
+ Args:
+ kpts_2d: 2D keypoint positions, shape (N, 2) where N is number of keypoints
+ img_width: Image width in pixels
+ img_height: Image height in pixels
+ margin: Distance from edge (in pixels) to consider "too close"
+ threshold: Fraction of keypoints that triggers rejection (e.g., 0.5 = 50%)
+
+ Returns:
+ True if hand should be rejected due to edge proximity, False otherwise
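+
+        Example (illustrative numbers):
+            With margin=5 and threshold=0.1, a 21-keypoint hand is rejected as
+            soon as 3 or more keypoints (>10%) fall within 5 pixels of any
+            image border.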
+ """
+ x = kpts_2d[:, 0]
+ y = kpts_2d[:, 1]
+
+ # Create boolean mask for keypoints near any image edge
+ near_edge = (
+ (x < margin) |
+ (y < margin) |
+ (x > img_width - margin) |
+ (y > img_height - margin)
+ )
+
+ frac_near_edge = np.mean(near_edge) # Fraction of keypoints near edge
+ return frac_near_edge > threshold
+
+ def _save_results(self, paths: Paths, left_sequence: HandSequence, right_sequence: HandSequence) -> None:
+ """
+ Save processed hand sequences and generate visualization videos.
+
+ Args:
+ paths: Paths object containing output file locations
+ left_sequence: Processed left hand pose sequence
+ right_sequence: Processed right hand pose sequence
+ """
+ # Create output directory
+ if not os.path.exists(getattr(paths, f"hand_processor")):
+ os.makedirs(getattr(paths, f"hand_processor"))
+
+ # Save hand sequence data in compressed format
+        left_sequence.save(paths.hand_data_left)
+        right_sequence.save(paths.hand_data_right)
+
+ # Save RGB frames for reference
+ media.write_video(getattr(paths, f"video_rgb_imgs"), left_sequence.imgs_rgb, fps=10, codec="ffv1")
+
+ # Load additional visualization components
+        imgs_bbox = media.read_video(paths.video_bboxes)
+
+ # Load segmentation visualization if available
+ if os.path.exists(getattr(paths, f"video_sam_arm")):
+ imgs_sam = media.read_video(getattr(paths, f"video_sam_arm"))
+ else:
+ imgs_sam = np.zeros((len(left_sequence.imgs_rgb), left_sequence.imgs_rgb[0].shape[0], left_sequence.imgs_rgb[0].shape[1], 3))
+
+ # Create comprehensive annotation video showing all processing stages
+ annot_imgs = []
+ for idx in range(len(left_sequence.imgs_rgb)):
+ img_hamer_left = left_sequence.imgs_hamer[idx]
+ img_hamer_right = right_sequence.imgs_hamer[idx]
+ img_bbox = imgs_bbox[idx]
+ img_sam = imgs_sam[idx]
+
+ # Combine visualizations in 2x2 grid: [bbox, sam] on top, [left_hand, right_hand] on bottom
+ annot_img = np.vstack((np.hstack((img_bbox, img_sam)), np.hstack((img_hamer_left, img_hamer_right)))).astype(np.uint8)
+ annot_imgs.append(annot_img)
+
+ # Save comprehensive visualization video
+ media.write_video(getattr(paths, f"video_annot"), np.array(annot_imgs), fps=10, codec="h264") # mp4
+
+ def _create_hand_mesh(self, hamer_out: Dict[str, Any]) -> trimesh.Trimesh:
+ """
+ Create a 3D triangle mesh from HaMeR pose estimation output.
+
+ Args:
+ hamer_out: HaMeR output dictionary containing vertex positions
+
+ Returns:
+ Trimesh object representing the hand mesh
+ """
+ return trimesh.Trimesh(hamer_out["verts"].copy(), self.detector_hamer.faces_left.copy(), process=False)
+
+ def _get_hand_masks(self, data_sub_folder: str, hamer_data_left: HandSequence, hamer_data_right: HandSequence) -> None:
+ """
+ Generate hand segmentation masks using processed pose data.
+
+ This method integrates with the segmentation processor to generate
+ detailed hand masks that can be used for depth-based pose refinement.
+
+ Args:
+ data_sub_folder: Path to demonstration data folder
+ hamer_data_left: Processed left hand sequence for guidance
+ hamer_data_right: Processed right hand sequence for guidance
+ """
+ hamer_data = {
+ "left": hamer_data_left,
+ "right": hamer_data_right
+ }
+ self.hand_mask_processor.process_one_demo(data_sub_folder, hamer_data)
+
+ @staticmethod
+ def _get_visible_pts_from_hamer(detector_hamer: DetectorHamer, hamer_out: Dict[str, Any], mesh: trimesh.Trimesh,
+ img_depth: np.ndarray, cam_intrinsics: Dict[str, Any]) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Identify visible hand vertices and their corresponding depth points.
+
+ Args:
+ detector_hamer: HaMeR detector instance for coordinate projections
+ hamer_out: HaMeR output containing pose estimates and camera parameters
+ mesh: 3D hand mesh generated from HaMeR output
+ img_depth: Depth image corresponding to the RGB frame
+ cam_intrinsics: Camera intrinsic parameters for 3D projection
+
+ Returns:
+ Tuple of (visible_points_3d, visible_hamer_vertices):
+ - visible_points_3d: 3D points from depth image at visible mesh locations
+ - visible_hamer_vertices: Corresponding vertices from the HaMeR mesh
+ """
+ # Perform ray-casting to identify visible mesh vertices
+ visible_hamer_vertices, _ = get_visible_points(mesh, origin=np.array([0,0,0]))
+
+ # Project 3D vertices to 2D image coordinates
+ visible_points_2d = detector_hamer.project_3d_kpt_to_2d(
+ (visible_hamer_vertices-hamer_out["T_cam_pred"].cpu().numpy()).astype(np.float32),
+ hamer_out["img_w"], hamer_out["img_h"], hamer_out["scaled_focal_length"],
+ hamer_out["camera_center"], hamer_out["T_cam_pred"])
+
+ # Filter out points that fall outside the depth image boundaries
+ original_visible_points_2d = visible_points_2d.copy()
+
+ # Create valid region mask (note: depth indexing is [y, x])
+ valid_mask = ((original_visible_points_2d[:, 0] < img_depth.shape[1]) &
+ (original_visible_points_2d[:, 1] < img_depth.shape[0]))
+
+ visible_points_2d = visible_points_2d[valid_mask]
+ visible_hamer_vertices = visible_hamer_vertices[valid_mask]
+
+ # Convert 2D depth pixels to 3D points using camera intrinsics
+ visible_points_3d = get_3D_points_from_pixels(visible_points_2d, img_depth, cam_intrinsics)
+
+ return visible_points_3d, visible_hamer_vertices
+
+ @staticmethod
+ def _get_transformation_estimate(visible_points_3d: np.ndarray,
+ visible_hamer_vertices: np.ndarray,
+ pcd: o3d.geometry.PointCloud) -> Tuple[np.ndarray, o3d.geometry.PointCloud]:
+ """
+ Estimate transformation to align HaMeR mesh with observed point cloud.
+
+ This method uses Iterative Closest Point (ICP) registration to find the
+ optimal transformation that aligns the visible parts of the predicted
+ hand mesh with the point cloud extracted from depth and segmentation data.
+
+ Args:
+ visible_points_3d: 3D points from depth image at mesh locations
+ visible_hamer_vertices: Corresponding vertices from HaMeR mesh
+ pcd: Point cloud from segmentation and depth data
+
+ Returns:
+ Tuple of (transformation_matrix, aligned_mesh_pointcloud):
+ - transformation_matrix: 4x4 transformation to align mesh with depth
+ - aligned_mesh_pointcloud: Transformed mesh point cloud after alignment
+ """
+ # Get initial transformation estimate using median translation
+ T_0 = HandBaseProcessor._get_initial_transformation_estimate(visible_points_3d, visible_hamer_vertices)
+
+ # Create point cloud from visible mesh vertices
+ visible_hamer_pcd = get_pcd_from_points(visible_hamer_vertices, colors=np.ones_like(visible_hamer_vertices) * [0, 1, 0])
+
+ try:
+ # Apply ICP registration for fine alignment
+ aligned_hamer_pcd, T = icp_registration(visible_hamer_pcd, pcd, voxel_size=0.005, init_transform=T_0)
+ except Exception as e:
+ logger.error(f"ICP registration failed: {e}")
+ return T_0, visible_hamer_pcd
+
+ return T, aligned_hamer_pcd
+
+ @staticmethod
+ def _get_initial_transformation_estimate(visible_points_3d: np.ndarray,
+ visible_hamer_vertices: np.ndarray) -> np.ndarray:
+ """
+ Compute initial transformation estimate for mesh-to-depth alignment.
+
+ This method provides a coarse alignment between the HaMeR prediction and
+ the depth-based point cloud using median translation. It assumes that
+ orientation is approximately correct and only translation correction is needed.
+
+ Args:
+ visible_points_3d: 3D points from depth image
+ visible_hamer_vertices: Corresponding HaMeR mesh vertices
+
+ Returns:
+ 4x4 transformation matrix with estimated translation
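+
+        Example (illustrative numbers):
+            If the depth points lie, on median, 2 cm further along +z than the
+            predicted mesh vertices, T_0 is the identity rotation with
+            translation [0, 0, 0.02].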
+ """
+ # Calculate median translation between corresponding point sets
+ translation = np.nanmedian(visible_points_3d - visible_hamer_vertices, axis=0)
+
+ # Create transformation matrix (identity rotation + translation)
+ T_0 = np.eye(4)
+ if not np.isnan(translation).any():
+ T_0[:3, 3] = translation
+
+ return T_0
+
+
+class Hand2DProcessor(HandBaseProcessor):
+ """
+ 2D hand pose processor optimized for speed and RGB-only operation.
+
+ This processor focuses on extracting 2D hand poses and basic 3D estimates
+ without depth sensor integration. It's designed for applications where
+ depth sensors are not available.
+ """
+ def __init__(self, args: argparse.Namespace) -> None:
+ """
+ Initialize 2D hand processor with RGB-only configuration.
+
+ Args:
+ args: Command line arguments for processor configuration
+ """
+ super().__init__(args)
+
+ def _process_image_with_hamer(self, img_rgb: np.ndarray, bboxes: np.ndarray, hand_side: str,
+ img_idx: int, view: bool = False) -> Dict[str, Any]:
+ """
+ Process RGB image with HaMeR for 2D pose estimation.
+
+ Args:
+ img_rgb: RGB image to process
+ bboxes: Hand bounding boxes for pose estimation guidance
+ hand_side: "left" or "right" specifying which hand to process
+ img_idx: Frame index for debugging and logging
+ view: Whether to display debug visualizations
+
+ Returns:
+ Dictionary containing:
+ - img_hamer: Annotated image with pose visualization
+ - kpts_3d: Estimated 3D keypoints
+ - kpts_2d: 2D keypoint projections in image coordinates
+
+ Raises:
+ ValueError: If no valid hand pose is detected in the image
+ """
+ # Configure HaMeR for target hand side
+        is_right = np.array([hand_side_dict[hand_side]] * len(bboxes))
+
+ # Apply HaMeR pose estimation
+ hamer_out = self.detector_hamer.detect_hand_keypoints(
+ img_rgb,
+ hand_side=hand_side,
+ bboxes=bboxes,
+ is_right=is_right,
+ camera_params=self.intrinsics_dict,
+ visualize=False
+ )
+
+ if hamer_out is None or not hamer_out.get("success", False):
+ raise ValueError("No hand detected in image")
+
+ return {
+ "img_hamer": hamer_out["annotated_img"][:,:,::-1], # Convert BGR to RGB
+ "kpts_3d": hamer_out["kpts_3d"],
+ "kpts_2d": hamer_out['kpts_2d']
+ }
+
+class Hand3DProcessor(HandBaseProcessor):
+ """
+ 3D hand pose processor with depth-based refinement capabilities.
+
+ This processor provides more accurate 3D hand poses by combining HaMeR
+ estimation with depth sensor data and hand segmentation. It uses point cloud
+ registration techniques to refine the initial pose estimates, resulting in
+ poses that are better aligned with the physical environment.
+
+ Processing Enhancements:
+ - Mesh generation from HaMeR output for visibility analysis
+ - Hand segmentation using SAM2 for accurate depth extraction
+ - ICP-based alignment between predicted mesh and observed point cloud
+ """
+ def __init__(self, args: argparse.Namespace) -> None:
+ """
+ Initialize 3D hand processor with depth refinement capabilities.
+
+ Args:
+ args: Command line arguments containing depth processing configuration
+ """
+ super().__init__(args)
+ self.args = args
+
+ # Storage for HaMeR outputs needed for depth alignment
+ self.hamer_out_dict: Dict[str, Dict[int, Dict[str, Any]]] = {
+ "left": defaultdict(dict),
+ "right": defaultdict(dict)
+ }
+
+ # Enable advanced processing features
+ self.process_hand_masks = True
+ self.apply_depth_alignment = True
+ self.hand_mask_processor = HandSegmentationProcessor(self.args)
+
+ def _process_image_with_hamer(self, img_rgb: np.ndarray, bboxes: np.ndarray, hand_side: str,
+ img_idx: int, view: bool = False) -> Dict[str, Any]:
+ """
+ Process RGB image with HaMeR optimized for subsequent depth refinement.
+
+ This method applies HaMeR pose estimation configured for 3D processing,
+ storing intermediate results needed for later depth-based refinement.
+
+ Args:
+ img_rgb: RGB image to process
+ bboxes: Hand bounding boxes for pose estimation guidance
+ hand_side: "left" or "right" specifying which hand to process
+ img_idx: Frame index for result storage and debugging
+ view: Whether to display debug visualizations
+
+ Returns:
+ Dictionary containing pose estimation results
+
+ Raises:
+ ValueError: If no valid hand pose is detected in the image
+ """
+ # Configure HaMeR for target hand side
+        is_right = np.array([hand_side_dict[hand_side]] * len(bboxes))
+
+ # Apply HaMeR with 2D keypoint focus (3D refinement happens later)
+ hamer_out = self.detector_hamer.detect_hand_keypoints(
+ img_rgb,
+ hand_side=hand_side,
+ bboxes=bboxes,
+ is_right=is_right,
+ kpts_2d_only=True, # Initial processing focuses on 2D
+ camera_params=self.intrinsics_dict
+ )
+
+ if hamer_out is None or not hamer_out.get("success", False):
+ raise ValueError("No hand detected in image")
+
+ # Store HaMeR output for later depth alignment processing
+ self.hamer_out_dict[hand_side][img_idx] = hamer_out
+
+ return {
+ "img_hamer": hamer_out["annotated_img"][:,:,::-1], # Convert BGR to RGB
+ "kpts_3d": hamer_out["kpts_3d"],
+ "kpts_2d": hamer_out['kpts_2d']
+ }
+
+ def _process_all_frames_depth_alignment(self, imgs_rgb: np.ndarray, hand_detections: np.ndarray,
+ hand_side: str, sequence: Optional[HandSequence] = None) -> HandSequence:
+ """
+ Apply depth-based refinement to all frames in the sequence.
+
+ This method performs the depth alignment stage of processing, using
+ segmentation masks and depth data to refine the initial HaMeR pose
+ estimates for improved 3D accuracy.
+
+ Args:
+ imgs_rgb: RGB video frames for reference
+ hand_detections: Boolean flags indicating frames with valid detections
+ hand_side: "left" or "right" specifying which hand to process
+ sequence: HandSequence containing initial pose estimates to refine
+
+ Returns:
+ HandSequence with refined 3D poses aligned to depth data
+ """
+ for img_idx in tqdm(range(len(imgs_rgb)), disable=False, leave=False):
+ if not hand_detections[img_idx]:
+ continue
+
+ # Apply depth-based refinement to this frame
+ frame_data = sequence.get_frame(img_idx)
+ frame_data.kpts_3d = self._depth_alignment(img_idx, hand_side, imgs_rgb[img_idx])
+ sequence.modify_frame(img_idx, frame_data)
+
+ return sequence
+
+ def _depth_alignment(self, img_idx: int, hand_side: str, img_rgb: np.ndarray) -> np.ndarray:
+ """
+ Perform depth-based pose refinement for a single frame.
+
+ Algorithm Steps:
+ 1. Extract depth image and segmentation mask for the frame
+ 2. Obtain 3D hand mesh from HaMeR output
+ 3. Create point cloud from segmented depth region
+ 4. Identify visible mesh vertices through ray casting
+ 5. Apply ICP registration between mesh and point cloud
+ 6. Transform original keypoints using computed alignment
+
+ Args:
+ img_idx: Index of the frame to process
+ hand_side: "left" or "right" specifying which hand to process
+ img_rgb: RGB image for reference (used in point cloud generation)
+
+ Returns:
+ Refined 3D keypoint positions aligned with depth data
+ """
+ # Load frame-specific data
+ img_depth = self.imgs_depth[img_idx]
+ mask = self.left_masks[img_idx] if hand_side == "left" else self.right_masks[img_idx]
+ hamer_out = self.hamer_out_dict[hand_side][img_idx]
+
+ # Create 3D hand mesh from HaMeR pose estimate
+ mesh = self._create_hand_mesh(hamer_out)
+
+ # Generate point cloud from depth image within segmented hand region
+ pcd = get_point_cloud_of_segmask(mask, img_depth, img_rgb, self.intrinsics_dict, visualize=False)
+
+ # Identify visible mesh vertices and corresponding depth points
+ visible_points_3d, visible_hamer_vertices = self._get_visible_pts_from_hamer(
+ self.detector_hamer,
+ hamer_out,
+ mesh,
+ img_depth,
+ self.intrinsics_dict
+ )
+
+ # Compute optimal transformation using ICP registration
+ T, _ = self._get_transformation_estimate(visible_points_3d, visible_hamer_vertices, pcd)
+
+ # Apply transformation to refine original keypoint positions
+ kpts_3d = transform_pts(hamer_out["kpts_3d"], T)
+
+ return kpts_3d
\ No newline at end of file
diff --git a/phantom/phantom/processors/handinpaint_processor.py b/phantom/phantom/processors/handinpaint_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ac9d948f497c8ca6b74fb84b646263aa82025ec
--- /dev/null
+++ b/phantom/phantom/processors/handinpaint_processor.py
@@ -0,0 +1,485 @@
+"""
+Hand Inpainting Processor Module
+
+This module removes human hands from demonstration videos using the E2FGVI model.
+
+Paper:
+Towards An End-to-End Framework for Flow-Guided Video Inpainting
+https://github.com/MCG-NKU/E2FGVI.git
+
+Processing Pipeline:
+1. Load pre-trained E2FGVI model and initialize GPU processing
+2. Read input video frames and corresponding hand segmentation masks
+3. Process frames in batches with neighboring temporal context
+4. Apply mask-guided inpainting to remove hand regions
+5. Verify complete processing and handle any missed frames
+6. Save final hand-free video for robot learning applications
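+
+Example usage (illustrative sketch; `args` stands for the argparse.Namespace built
+elsewhere in the pipeline and the demo path is hypothetical):
+
+    processor = HandInpaintProcessor(args)
+    processor.process_one_demo("data/raw/pick_and_place/0")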
+"""
+
+import cv2
+from PIL import Image
+import numpy as np
+import os
+from pathlib import Path
+from tqdm import tqdm
+import torch
+import mediapy as media
+import logging
+import gc
+from typing import List, Tuple, Optional, Any, Union
+
+from phantom.processors.base_processor import BaseProcessor
+from phantom.utils.data_utils import get_parent_folder_of_package
+from E2FGVI.model.e2fgvi_hq import InpaintGenerator # type: ignore
+from E2FGVI.core.utils import to_tensors # type: ignore
+
+DEFAULT_CHECKPOINT = 'E2FGVI/release_model/E2FGVI-HQ-CVPR22.pth'
+
+logger = logging.getLogger(__name__)
+
+class HandInpaintProcessor(BaseProcessor):
+ """
+ Hand inpainting processor for removing human hands from demonstration videos.
+
+ Attributes:
+ model: E2FGVI neural network model for video inpainting
+ device: GPU/CPU device for model execution
+ ref_length (int): Spacing between reference frames for temporal consistency
+ num_ref (int): Number of reference frames to use (-1 for automatic)
+ neighbor_stride (int): Spacing between neighboring frames in temporal context
+ batch_size (int): Number of frame groups to process simultaneously
+ scale_factor (int): Resolution scaling factor for processing optimization
+ """
+
+ def __init__(self, args: Any) -> None:
+ """
+ Initialize the hand inpainting processor with E2FGVI model and parameters.
+
+ Args:
+ args: Command line arguments containing processing configuration
+ including scale factor and other inpainting parameters
+ """
+ super().__init__(args)
+
+ # Load pre-trained E2FGVI model
+ root_dir = get_parent_folder_of_package("E2FGVI")
+ checkpoint_path = Path(root_dir, DEFAULT_CHECKPOINT)
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Initialize and load the inpainting model
+ self.model = InpaintGenerator().to(self.device)
+ data = torch.load(checkpoint_path, map_location=self.device)
+ self.model.load_state_dict(data)
+ self.model.eval()
+
+ # Configure temporal processing parameters
+ self.ref_length: int = 20 # Spacing between reference frames
+ self.num_ref: int = -1 # Number of reference frames (-1 = automatic)
+ self.neighbor_stride: int = 5 # Stride for neighboring frame selection
+
+ # Configure batch processing parameters for memory optimization
+ self.batch_size: int = 10 # Number of frame groups per batch
+ self.scale_factor: int = getattr(args, 'scale_factor', 2) # Resolution scaling
+
+ def _clear_gpu_memory(self) -> None:
+ """Clear GPU memory cache and trigger garbage collection."""
+ torch.cuda.empty_cache()
+ gc.collect()
+
+ def process_one_demo(self, data_sub_folder: str) -> None:
+ """
+ Process a single demonstration video to remove hand regions.
+
+ Args:
+ data_sub_folder: Path to demonstration data folder containing
+ input video and hand segmentation masks
+ """
+ save_folder = self.get_save_folder(data_sub_folder)
+ paths = self.get_paths(save_folder)
+ if not os.path.exists(paths.inpaint_processor):
+ os.makedirs(paths.inpaint_processor)
+
+ self._process_frames(paths)
+
+ def _process_frames(self, paths: Any) -> None:
+ """
+ Process all video frames to remove hand regions using E2FGVI inpainting.
+
+ Args:
+ paths: Paths object containing input video and mask file locations
+ """
+ # Load and prepare video frames
+ frames = self._load_and_prepare_frames(paths)
+ video_length = len(frames)
+ logger.info(f"Processing {video_length} frames")
+
+ # Initialize tracking arrays for processed frames
+ comp_frames: List[Optional[np.ndarray]] = [None] * video_length
+ processed_frame_mask: List[bool] = [False] * video_length
+
+ # Process frames in batches with temporal overlap for consistency
+ self._process_frames_in_batches(frames, paths, comp_frames, processed_frame_mask)
+
+ # Handle any missed frames
+ self._process_missed_frames(frames, paths, comp_frames, processed_frame_mask)
+
+ # Final verification and save
+ self._verify_and_save_results(comp_frames, paths)
+
+ def _load_and_prepare_frames(self, paths: Any) -> List[Image.Image]:
+ """Load video frames and prepare them for processing."""
+ frames = self.read_frame_from_videos(paths.video_rgb_imgs)
+
+ # Calculate output dimensions based on configuration
+ h, w = frames[0].height, frames[0].width
+
+ if self.epic:
+ size = (w, h)
+ else:
+ if self.square:
+ output_resolution = np.array([self.output_resolution, self.output_resolution])
+ else:
+ output_resolution = np.array([int(w/h*self.output_resolution), self.output_resolution])
+ output_resolution = output_resolution.astype(np.int32)
+ size = output_resolution
+ frames, size = self.resize_frames(frames, size)
+
+ return frames
+
+ def _process_frames_in_batches(self, frames: List[Image.Image], paths: Any,
+ comp_frames: List[Optional[np.ndarray]],
+ processed_frame_mask: List[bool]) -> None:
+ """Process frames in batches with temporal overlap."""
+ video_length = len(frames)
+ h, w = frames[0].height, frames[0].width
+
+ for batch_start in tqdm(range(0, video_length, self.batch_size * self.neighbor_stride),
+ desc="Processing batches"):
+ batch_end = min(batch_start + self.batch_size * self.neighbor_stride + self.neighbor_stride, video_length)
+
+ # Prepare batch data
+ batch_data = self._prepare_batch_data(frames, paths, batch_start, batch_end, h, w)
+
+ # Process frames within batch
+ self._process_batch_frames(frames, batch_data, batch_start, batch_end,
+ comp_frames, processed_frame_mask, h, w)
+
+ # Clean up batch memory
+ del batch_data['batch_imgs'], batch_data['batch_masks']
+ self._clear_gpu_memory()
+
+ def _prepare_batch_data(self, frames: List[Image.Image], paths: Any,
+ batch_start: int, batch_end: int, h: int, w: int) -> dict:
+ """Prepare batch data including frames, masks, and binary masks."""
+ batch_frames = frames[batch_start:batch_end]
+ batch_imgs = to_tensors()(batch_frames).unsqueeze(0).to(self.device) * 2 - 1
+
+ batch_masks = self.read_mask(paths.masks_arm, (w, h))[batch_start:batch_end]
+ batch_masks = to_tensors()(batch_masks).unsqueeze(0).to(self.device)
+
+ binary_masks = self._create_binary_masks(paths.masks_arm, batch_start, batch_end, w, h)
+
+ return {
+ 'batch_imgs': batch_imgs,
+ 'batch_masks': batch_masks,
+ 'binary_masks': binary_masks
+ }
+
+ def _create_binary_masks(self, mask_path: str, batch_start: int, batch_end: int,
+ w: int, h: int) -> List[np.ndarray]:
+ """Create binary masks for the batch."""
+ masks = self.read_mask(mask_path, (w, h))[batch_start:batch_end]
+ binary_masks = []
+
+ for mask in masks:
+ mask_array = np.array(mask)
+ binary_mask = np.expand_dims((mask_array != 0).astype(np.uint8), 2)
+ binary_mask = cv2.resize(binary_mask, (w, h), interpolation=cv2.INTER_NEAREST)
+ binary_mask = np.expand_dims(binary_mask, 2)
+ binary_masks.append(binary_mask)
+
+ return binary_masks
+
+ def _process_batch_frames(self, frames: List[Image.Image], batch_data: dict,
+ batch_start: int, batch_end: int,
+ comp_frames: List[Optional[np.ndarray]],
+ processed_frame_mask: List[bool], h: int, w: int) -> None:
+ """Process individual frames within a batch."""
+ stride = max(1, self.neighbor_stride if batch_start + self.batch_size * self.neighbor_stride < len(frames) else 1)
+
+ for frame_idx in range(batch_start, batch_end, stride):
+ neighbor_ids = self._get_neighbor_ids(frame_idx, batch_start, batch_end)
+ ref_ids = self.get_ref_index(frame_idx, neighbor_ids, batch_end)
+
+ if not neighbor_ids:
+ continue
+
+ # Convert to batch-relative indices
+ batch_neighbor_ids = [i - batch_start for i in neighbor_ids]
+ batch_ref_ids = [i - batch_start for i in ref_ids if batch_start <= i < batch_end]
+
+ # Process frame with temporal context
+ self._process_single_frame(frames, batch_data, neighbor_ids, batch_neighbor_ids,
+ batch_ref_ids, comp_frames, processed_frame_mask, h, w)
+
+ self._clear_gpu_memory()
+
+ def _get_neighbor_ids(self, frame_idx: int, batch_start: int, batch_end: int) -> List[int]:
+ """Get neighboring frame indices for temporal context."""
+ return list(range(
+ max(batch_start, frame_idx - self.neighbor_stride),
+ min(batch_end, frame_idx + self.neighbor_stride + 1)
+ ))
+
+ def _process_single_frame(self, frames: List[Image.Image], batch_data: dict,
+ neighbor_ids: List[int], batch_neighbor_ids: List[int],
+ batch_ref_ids: List[int], comp_frames: List[Optional[np.ndarray]],
+ processed_frame_mask: List[bool], h: int, w: int) -> None:
+ """Process a single frame with its temporal context."""
+ batch_start = neighbor_ids[0] - batch_neighbor_ids[0]
+
+ # Select relevant frames and masks
+ selected_imgs = batch_data['batch_imgs'][:, batch_neighbor_ids + batch_ref_ids, :, :, :]
+ selected_masks = batch_data['batch_masks'][:, batch_neighbor_ids + batch_ref_ids, :, :]
+
+ with torch.no_grad():
+ # Apply masks and generate inpainted frames
+ masked_imgs = selected_imgs * (1 - selected_masks)
+ masked_imgs = self._pad_images(masked_imgs, h, w)
+
+ pred_imgs, _ = self.model(masked_imgs, len(batch_neighbor_ids))
+ pred_imgs = (pred_imgs[:, :, :h, :w] + 1) / 2
+ pred_imgs = (pred_imgs.cpu().permute(0, 2, 3, 1).numpy() * 255).astype(np.uint8)
+
+ # Composite with original background
+ for i, idx in enumerate(neighbor_ids):
+ binary_mask = batch_data['binary_masks'][idx - batch_start]
+ original_frame = np.array(frames[idx])
+
+ inpainted_frame = (pred_imgs[i] * binary_mask +
+ original_frame * (1 - binary_mask))
+
+ # Average with previous results if frame was already processed
+ if comp_frames[idx] is None:
+ comp_frames[idx] = inpainted_frame
+ else:
+ comp_frames[idx] = ((comp_frames[idx].astype(np.float32) +
+ inpainted_frame.astype(np.float32)) / 2).astype(np.uint8)
+ processed_frame_mask[idx] = True
+
+ def _process_missed_frames(self, frames: List[Image.Image], paths: Any,
+ comp_frames: List[Optional[np.ndarray]],
+ processed_frame_mask: List[bool]) -> None:
+ """Process any frames that were missed during batch processing."""
+ unprocessed_frames = [i for i, processed in enumerate(processed_frame_mask) if not processed]
+
+ if not unprocessed_frames:
+ return
+
+ logger.warning(f"Found {len(unprocessed_frames)} unprocessed frames at indices: {unprocessed_frames}")
+
+ # Determine processing context for missed frames
+ start_idx, end_idx = self._get_missed_frame_context(unprocessed_frames, processed_frame_mask, len(frames))
+
+ logger.info(f"Processing missed frames from {start_idx} to {end_idx}")
+ self._process_missed_frame_sequence(frames, paths, unprocessed_frames,
+ start_idx, end_idx, comp_frames, processed_frame_mask)
+
+ def _get_missed_frame_context(self, unprocessed_frames: List[int],
+ processed_frame_mask: List[bool], video_length: int) -> Tuple[int, int]:
+ """Get the context range for processing missed frames."""
+ last_processed_idx = max([i for i, processed in enumerate(processed_frame_mask[:unprocessed_frames[0]])
+ if processed], default=-1)
+ if last_processed_idx == -1:
+ last_processed_idx = 0
+
+ next_processed_idx = min([i for i, processed in enumerate(processed_frame_mask[unprocessed_frames[-1]:],
+ start=unprocessed_frames[-1]) if processed], default=video_length)
+
+ start_idx = max(0, last_processed_idx - self.neighbor_stride)
+ end_idx = min(video_length, next_processed_idx + self.neighbor_stride)
+
+ return start_idx, end_idx
+
+ def _process_missed_frame_sequence(self, frames: List[Image.Image], paths: Any,
+ unprocessed_frames: List[int], start_idx: int, end_idx: int,
+ comp_frames: List[Optional[np.ndarray]],
+ processed_frame_mask: List[bool]) -> None:
+ """Process the sequence containing missed frames."""
+ h, w = frames[0].height, frames[0].width
+
+ # Prepare sequence data
+ batch_frames = frames[start_idx:end_idx]
+ batch_imgs = to_tensors()(batch_frames).unsqueeze(0).to(self.device) * 2 - 1
+
+ batch_masks = self.read_mask(paths.masks_arm, (w, h))[start_idx:end_idx]
+ batch_masks = to_tensors()(batch_masks).unsqueeze(0).to(self.device)
+
+ binary_masks = self._create_binary_masks(paths.masks_arm, start_idx, end_idx, w, h)
+
+ # Process each missed frame
+ for idx in tqdm(unprocessed_frames, desc="Processing missed frames"):
+ self._process_missed_single_frame(frames, batch_imgs, batch_masks, binary_masks,
+ idx, start_idx, end_idx, comp_frames, processed_frame_mask, h, w)
+
+ del batch_imgs, batch_masks
+ self._clear_gpu_memory()
+
+ def _process_missed_single_frame(self, frames: List[Image.Image], batch_imgs: torch.Tensor,
+ batch_masks: torch.Tensor, binary_masks: List[np.ndarray],
+ frame_idx: int, start_idx: int, end_idx: int,
+ comp_frames: List[Optional[np.ndarray]],
+ processed_frame_mask: List[bool], h: int, w: int) -> None:
+ """Process a single missed frame."""
+ relative_start = frame_idx - start_idx
+ neighbor_ids = list(range(
+ max(0, relative_start - self.neighbor_stride),
+ min(end_idx - start_idx, relative_start + self.neighbor_stride + 1)
+ ))
+ ref_ids = self.get_ref_index(relative_start, neighbor_ids, end_idx - start_idx)
+
+ with torch.no_grad():
+ selected_imgs = batch_imgs[:, neighbor_ids + ref_ids, :, :, :]
+ selected_masks = batch_masks[:, neighbor_ids + ref_ids, :, :]
+
+ masked_imgs = selected_imgs * (1 - selected_masks)
+ masked_imgs = self._pad_images(masked_imgs, h, w)
+
+ pred_imgs, _ = self.model(masked_imgs, len(neighbor_ids))
+ pred_imgs = (pred_imgs[:, :, :h, :w] + 1) / 2
+ pred_imgs = (pred_imgs.cpu().permute(0, 2, 3, 1).numpy() * 255).astype(np.uint8)
+
+ relative_idx = frame_idx - start_idx - neighbor_ids[0]
+ binary_mask = binary_masks[frame_idx - start_idx]
+ original_frame = np.array(frames[frame_idx])
+
+ inpainted_frame = (pred_imgs[relative_idx] * binary_mask +
+ original_frame * (1 - binary_mask))
+ comp_frames[frame_idx] = inpainted_frame
+ processed_frame_mask[frame_idx] = True
+
+ def _verify_and_save_results(self, comp_frames: List[Optional[np.ndarray]], paths: Any) -> None:
+ """Verify all frames were processed and save the final video."""
+ missing_frames = [i for i, frame in enumerate(comp_frames)
+ if frame is None or (isinstance(frame, np.ndarray) and frame.size == 0)]
+
+ if missing_frames:
+ raise RuntimeError(f"Still found unprocessed frames after cleanup: {missing_frames}")
+
+ logger.info("Successfully processed all frames")
+
+ # Save final inpainted video
+ media.write_video(paths.video_human_inpaint, comp_frames, fps=15, codec="ffv1")
+
+ def get_ref_index(self, f: int, neighbor_ids: List[int], length: int) -> List[int]:
+ """
+ Select reference frame indices for temporal consistency.
+
+ Args:
+ f: Current frame index
+ neighbor_ids: List of neighboring frame indices
+ length: Total length of the sequence
+
+ Returns:
+ List of reference frame indices for temporal consistency
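+
+        Example (illustrative numbers):
+            With ref_length=20, num_ref=-1, neighbor_ids=[35..45] and length=100,
+            the selected references are [0, 20, 60, 80]; frame 40 is skipped
+            because it already appears among the neighbors.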
+ """
+ if self.num_ref == -1:
+ # Automatic reference selection: every ref_length frames not in neighbors
+ ref_index = [
+ i for i in range(0, length, self.ref_length)
+ if i not in neighbor_ids
+ ]
+ else:
+ # Limited reference selection: specific number around current frame
+ ref_index = []
+ for i in range(max(0, f - self.ref_length * (self.num_ref // 2)),
+ min(length, f + self.ref_length * (self.num_ref // 2)) + 1,
+ self.ref_length):
+ if i not in neighbor_ids and len(ref_index) < self.num_ref:
+ ref_index.append(i)
+ return ref_index
+
+ @staticmethod
+ def read_mask(mask_path: str, size: Tuple[int, int]) -> List[Image.Image]:
+ """
+ Load and process hand segmentation masks for inpainting guidance.
+
+ Args:
+ mask_path: Path to mask file containing hand segmentation data
+ size: Target size (width, height) for mask resizing
+
+ Returns:
+ List of processed PIL Images containing binary hand masks
+ """
+ masks = []
+ frames_media = np.load(mask_path, allow_pickle=True)
+ frames = [frame for frame in frames_media]
+
+ for mask_frame in frames:
+ # Convert to PIL Image and resize
+ mask_img = Image.fromarray(mask_frame)
+ mask_img = mask_img.resize(size, Image.NEAREST)
+ mask_array = np.array(mask_img.convert('L'))
+
+ # Create binary mask
+ binary_mask = np.array(mask_array > 0).astype(np.uint8)
+
+ # Apply morphological dilation to expand mask boundaries
+ # This helps ensure complete coverage of hand regions
+ dilated_mask = cv2.dilate(binary_mask,
+ cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)),
+ iterations=4)
+ masks.append(Image.fromarray(dilated_mask * 255))
+ return masks
+
+ @staticmethod
+ def read_frame_from_videos(video_path: str) -> List[Image.Image]:
+ """
+ Load video frames and convert to PIL Images.
+
+ Args:
+ video_path: Path to video file
+
+ Returns:
+ List of PIL Images containing video frames
+ """
+ return [Image.fromarray(frame) for frame in media.read_video(video_path)]
+
+ @staticmethod
+ def resize_frames(frames: List[Image.Image], size: Optional[Tuple[int, int]] = None) -> Tuple[List[Image.Image], Tuple[int, int]]:
+ """
+ Resize video frames to target resolution.
+
+ Args:
+ frames: List of PIL Images to resize
+ size: Target size (width, height), or None to keep original
+
+ Returns:
+ Tuple containing resized frames and final size
+ """
+        if size is None:
+            # Keep the original resolution when no target size is given
+            size = (frames[0].width, frames[0].height)
+        return [f.resize(size) for f in frames], size
+
+ @staticmethod
+ def _pad_images(img_tensor: torch.Tensor, h: int, w: int) -> torch.Tensor:
+ """
+ Pad image tensor to meet model input requirements.
+
+ Args:
+ img_tensor: Input image tensor to pad
+ h: Original height
+ w: Original width
+
+ Returns:
+ Padded image tensor suitable for model input
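+
+        Example (illustrative numbers):
+            A 480x854 frame has h_pad = 0 and w_pad = (108 - 854 % 108) % 108 = 10,
+            so it is reflection-padded to 480x864.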
+ """
+ # Model requires specific dimension multiples
+ mod_size_h, mod_size_w = 60, 108
+
+ # Calculate required padding
+ h_pad = (mod_size_h - h % mod_size_h) % mod_size_h
+ w_pad = (mod_size_w - w % mod_size_w) % mod_size_w
+
+ # Apply reflection padding to avoid boundary artifacts
+ img_tensor = torch.cat([img_tensor, torch.flip(img_tensor, [3])], 3)[:, :, :, :h + h_pad, :]
+ return torch.cat([img_tensor, torch.flip(img_tensor, [4])], 4)[:, :, :, :, :w + w_pad]
+
diff --git a/phantom/phantom/processors/paths.py b/phantom/phantom/processors/paths.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3007dd0f567d26cc9d48f874fbc743676a39b30
--- /dev/null
+++ b/phantom/phantom/processors/paths.py
@@ -0,0 +1,219 @@
+"""
+Path management for Phantom.
+"""
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Dict, Optional
+import yaml
+
+from phantom.utils.image_utils import convert_video_to_images
+
+@dataclass
+class Paths:
+ """Data class containing all file paths used by processors."""
+ data_path: Path
+ robot_name: str = "franka"
+
+ def __post_init__(self):
+ """Compute derived paths based on base paths."""
+ # Convert string paths to Path objects if needed
+ if isinstance(self.data_path, str):
+ self.data_path = Path(self.data_path)
+
+ # Validate data path
+ if not self.data_path.exists():
+ raise FileNotFoundError(f"Data path does not exist: {self.data_path}")
+
+ # Videos
+ self.video_left = self.data_path / "video_L.mp4"
+ self.video_right = self.data_path / "video_R.mp4"
+ self.video_rgb_imgs = self.data_path / "video_rgb_imgs.mkv"
+
+ # Image folders
+ self.original_images_folder = self.data_path / "original_images"
+ # self._setup_original_images()
+ self.original_images_folder_reverse = self.data_path / "original_images_reverse"
+ # self._setup_original_images_reverse()
+
+ # Epic annotations
+ self.hand_detection_data = self.data_path / "hand_det.pkl"
+ self.cam_extrinsics_data = self.data_path / "extrinsics.npy"
+
+ # Depth
+ self.depth = self.data_path / "depth.npy"
+
+ # Bbox processor
+ self.bbox_processor = self.data_path / "bbox_processor"
+ self.bbox_data = self.bbox_processor / "bbox_data.npz"
+ self.video_bboxes = self.bbox_processor / "video_bboxes.mkv"
+
+ # Segmentation processor
+ self.segmentation_processor = self.data_path / "segmentation_processor"
+ self.masks_arm = self.segmentation_processor / "masks_arm.npy"
+ self.video_masks_arm = self.segmentation_processor / "video_masks_arm.mkv"
+ self.video_sam_arm = self.segmentation_processor / "video_sam_arm.mkv"
+ for side in ["left", "right"]:
+ setattr(self, f"masks_hand_{side}", self.segmentation_processor / f"masks_hand_{side}.npy")
+ setattr(self, f"video_masks_hand_{side}", self.segmentation_processor / f"video_masks_hand_{side}.mkv")
+ setattr(self, f"video_sam_hand_{side}", self.segmentation_processor / f"video_sam_hand_{side}.mkv")
+
+ # Hand Processor
+ self.hand_processor = self.data_path / f"hand_processor"
+ for side in ["left", "right"]:
+ setattr(self, f"hand_data_{side}", self.hand_processor / f"hand_data_{side}.npz")
+ setattr(self, f"hand_data_3d_{side}", self.hand_processor / f"hand_data_3d_{side}.npz")
+ self.video_annot = self.data_path / "video_annot.mp4"
+
+ # Action processor
+ self.action_processor = self.data_path / "action_processor"
+ for side in ["left", "right"]:
+ setattr(self, f"actions_{side}", self.action_processor / f"actions_{side}.npz")
+
+ # Smoothing processor
+ self.smoothing_processor = self.data_path / f"smoothing_processor"
+ for side in ["left", "right"]:
+ setattr(self, f"smoothed_actions_{side}", self.smoothing_processor / f"smoothed_actions_{side}.npz")
+
+ # Inpaint processor
+ self.inpaint_processor = self.data_path / "inpaint_processor"
+ self.video_overlay = self.data_path / "video_overlay.mkv"
+ self.video_human_inpaint = self.inpaint_processor / "video_human_inpaint.mkv"
+ self.video_inpaint_overlay = self.inpaint_processor / "video_inpaint_overlay.mkv"
+ self.video_birdview = self.inpaint_processor / "video_birdview.mkv"
+ self.training_data = self.inpaint_processor / "training_data.npz"
+
+ def _setup_original_images(self):
+ """Set up original images paths."""
+ convert_video_to_images(self.video_left, self.original_images_folder, square=False)
+ image_paths = sorted(
+ list(self.original_images_folder.glob("*.jpg")),
+ key=lambda x: int(x.stem)
+ )
+ self.original_images = image_paths
+
+ def _setup_original_images_reverse(self):
+ """Set up original images paths."""
+ convert_video_to_images(self.video_left, self.original_images_folder_reverse, square=False, reverse=True)
+ image_paths = sorted(
+ list(self.original_images_folder_reverse.glob("*.jpg")),
+ key=lambda x: int(x.stem)
+ )
+ self.original_images_reverse = image_paths
+
+ def ensure_directories_exist(self):
+ """
+ Create necessary directories if they don't exist.
+ """
+ # Create all necessary directories
+ directories = [
+ self.data_path,
+ ]
+
+ for directory in directories:
+ if isinstance(directory, Path) and not directory.exists():
+ directory.mkdir(parents=True, exist_ok=True)
+
+
+
+class PathsConfig:
+ """
+ Configuration for paths used in the project.
+
+ This class handles loading and saving path configurations from files,
+ and provides methods for creating Paths objects.
+ """
+
+ def __init__(self, config_file: Optional[str] = None) -> None:
+ """
+ Initialize paths configuration.
+
+ Args:
+ config_file: Path to configuration file. If None, use default config.
+ """
+        self.config: Dict[str, str] = {}
+ if config_file:
+ self.load_config(config_file)
+ else:
+ self.set_default_config()
+
+ def load_config(self, config_file: str) -> None:
+ """
+ Load configuration from a YAML file.
+
+ Args:
+ config_file: Path to configuration file
+
+ Raises:
+ FileNotFoundError: If config file doesn't exist
+ yaml.YAMLError: If config file is invalid YAML
+ """
+ try:
+ with open(config_file, 'r') as f:
+ self.config = yaml.safe_load(f)
+ except FileNotFoundError:
+ raise FileNotFoundError(f"Configuration file not found: {config_file}")
+ except yaml.YAMLError as e:
+ raise yaml.YAMLError(f"Invalid YAML in configuration file {config_file}: {e}")
+
+ def save_config(self, config_file: str) -> None:
+ """
+ Save configuration to a YAML file.
+
+ Args:
+ config_file: Path to save configuration file
+
+ Raises:
+ OSError: If unable to write to the file
+ """
+ with open(config_file, 'w') as f:
+ yaml.dump(self.config, f, default_flow_style=False)
+
+ def set_default_config(self) -> None:
+ """Set default configuration values."""
+ self.config = {
+ 'data_root': './data',
+ 'processed_root': './processed_data',
+ 'project_name': 'phantom',
+ }
+
+ def get_paths(self, demo_name: str, robot_name: str = "franka") -> Paths:
+ """
+ Get Paths object for a specific demo.
+
+ Args:
+ demo_name: Name of the demo
+ robot_name: Name of the robot
+
+ Returns:
+ Paths object for the demo
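+
+        Example (illustrative; assumes the demo folder already exists under
+        data_root, since Paths validates the path on construction):
+
+            paths = PathsConfig().get_paths("pick_and_place/0")
+            print(paths.bbox_data)  # <data_root>/pick_and_place/0/bbox_processor/bbox_data.npz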
+ """
+ data_path = os.path.join(self.config['data_root'], demo_name)
+
+ return Paths(
+ data_path=Path(data_path),
+ robot_name=robot_name
+ )
+
+ def get_all_demo_paths(self) -> List[str]:
+ """
+ Get list of all demo paths in data root.
+
+ Returns:
+ List of demo paths
+ """
+ data_root = self.config['data_root']
+ all_data_collection_folders = [
+ f for f in os.listdir(data_root)
+ if os.path.isdir(os.path.join(data_root, f))
+ ]
+
+ all_data_folders = [
+ os.path.join(d1, d2)
+            for d1 in all_data_collection_folders
+ for d2 in os.listdir(os.path.join(data_root, d1))
+ if os.path.isdir(os.path.join(data_root, d1, d2))
+ ]
+
+ return sorted(all_data_folders, key=lambda x: tuple(map(int, x.rsplit('/', 2)[-2:])))
\ No newline at end of file
diff --git a/phantom/phantom/processors/phantom_data.py b/phantom/phantom/processors/phantom_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd25a072505edc9f35e142a533619eb9e5d3e0f8
--- /dev/null
+++ b/phantom/phantom/processors/phantom_data.py
@@ -0,0 +1,340 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Callable, Any
+import numpy as np
+
+hand_side_dict = {
+ 'left': 0,
+ 'right': 1,
+}
+
+class LazyLoadingMixin:
+ """Mixin to provide lazy loading functionality for cached properties."""
+
+ def _invalidate_cache(self) -> None:
+ """Invalidate all cached properties. Override in subclasses."""
+ pass
+
+ def _get_cached_property(self, cache_attr: str, compute_func: Callable[[], Any]) -> Any:
+ """Generic lazy loading for cached properties."""
+ if getattr(self, cache_attr) is None:
+ setattr(self, cache_attr, compute_func())
+ return getattr(self, cache_attr)
+
+@dataclass
+class TrainingData:
+ """Container for processing results"""
+ frame_idx: int
+ valid: bool
+ action_pos_left: np.ndarray
+ action_orixyzw_left: np.ndarray
+ action_pos_right: np.ndarray
+ action_orixyzw_right: np.ndarray
+ action_gripper_left: np.ndarray
+ action_gripper_right: np.ndarray
+ gripper_width_left: np.ndarray
+ gripper_width_right: np.ndarray
+
+ @classmethod
+ def create_empty_frame(cls, frame_idx: int) -> 'TrainingData':
+ """Create a frame with no hand detection"""
+ return cls(
+ frame_idx=frame_idx,
+ valid=False,
+ action_pos_left=np.zeros((3,)),
+ action_orixyzw_left=np.zeros((4,)),
+ action_pos_right=np.zeros((3,)),
+ action_orixyzw_right=np.zeros((4,)),
+ action_gripper_left=0,
+ action_gripper_right=0,
+ gripper_width_left=0,
+ gripper_width_right=0,
+ )
+
+class TrainingDataSequence(LazyLoadingMixin):
+ """Container for a sequence of training data"""
+ def __init__(self):
+ self.frames: List[TrainingData] = []
+ self.metadata: Dict = {}
+
+ self._frame_indices: Optional[np.ndarray] = None
+ self._valid: Optional[np.ndarray] = None
+ self._action_pos_left: Optional[np.ndarray] = None
+ self._action_orixyzw_left: Optional[np.ndarray] = None
+ self._action_pos_right: Optional[np.ndarray] = None
+ self._action_orixyzw_right: Optional[np.ndarray] = None
+ self._action_gripper_left: Optional[np.ndarray] = None
+ self._action_gripper_right: Optional[np.ndarray] = None
+ self._gripper_width_left: Optional[np.ndarray] = None
+ self._gripper_width_right: Optional[np.ndarray] = None
+
+ def add_frame(self, frame: TrainingData) -> None:
+ """Add a frame to the sequence and invalidate cached properties."""
+ self.frames.append(frame)
+ self._invalidate_cache()
+
+ def save(self, path: str) -> None:
+ """Save the sequence to disk in both frame-wise and sequence-wise formats"""
+
+ sequence_data = {
+ 'frame_indices': self.frame_indices,
+ 'valid': self.valid,
+ 'action_pos_left': self.action_pos_left,
+ 'action_orixyzw_left': self.action_orixyzw_left,
+ 'action_pos_right': self.action_pos_right,
+ 'action_orixyzw_right': self.action_orixyzw_right,
+ 'action_gripper_left': self.action_gripper_left,
+ 'action_gripper_right': self.action_gripper_right,
+ 'gripper_width_left': self.gripper_width_left,
+ 'gripper_width_right': self.gripper_width_right,
+ }
+
+ np.savez_compressed(
+ path,
+ **sequence_data
+ )
+
+ @property
+ def frame_indices(self) -> np.ndarray:
+ """Lazy loading of all frame indices"""
+ return self._get_cached_property(
+ '_frame_indices',
+ lambda: np.arange(len(self.frames))
+ )
+
+ @property
+ def valid(self) -> np.ndarray:
+ """Lazy loading of all valid flags"""
+ return self._get_cached_property(
+ '_valid',
+ lambda: np.stack([f.valid for f in self.frames])
+ )
+
+ @property
+ def action_pos_left(self) -> np.ndarray:
+ """Lazy loading of all action positions"""
+ return self._get_cached_property(
+ '_action_pos_left',
+ lambda: np.stack([f.action_pos_left for f in self.frames])
+ )
+
+ @property
+ def action_orixyzw_left(self) -> np.ndarray:
+ """Lazy loading of all action orientations"""
+ return self._get_cached_property(
+ '_action_orixyzw_left',
+ lambda: np.stack([f.action_orixyzw_left for f in self.frames])
+ )
+
+ @property
+ def action_pos_right(self) -> np.ndarray:
+ """Lazy loading of all action positions"""
+ return self._get_cached_property(
+ '_action_pos_right',
+ lambda: np.stack([f.action_pos_right for f in self.frames])
+ )
+
+ @property
+ def action_orixyzw_right(self) -> np.ndarray:
+ """Lazy loading of all action orientations"""
+ return self._get_cached_property(
+ '_action_orixyzw_right',
+ lambda: np.stack([f.action_orixyzw_right for f in self.frames])
+ )
+
+ @property
+ def action_gripper_left(self) -> np.ndarray:
+ """Lazy loading of all action gripper distances"""
+ return self._get_cached_property(
+ '_action_gripper_left',
+ lambda: np.stack([f.action_gripper_left for f in self.frames])
+ )
+
+ @property
+ def action_gripper_right(self) -> np.ndarray:
+ """Lazy loading of all action gripper distances"""
+ return self._get_cached_property(
+ '_action_gripper_right',
+ lambda: np.stack([f.action_gripper_right for f in self.frames])
+ )
+
+ @property
+ def gripper_width_left(self) -> np.ndarray:
+ """Lazy loading of all gripper widths"""
+ return self._get_cached_property(
+ '_gripper_width_left',
+ lambda: np.stack([f.gripper_width_left for f in self.frames])
+ )
+
+ @property
+ def gripper_width_right(self) -> np.ndarray:
+ """Lazy loading of all gripper widths"""
+ return self._get_cached_property(
+ '_gripper_width_right',
+ lambda: np.stack([f.gripper_width_right for f in self.frames])
+ )
+
+ def _invalidate_cache(self):
+ """Invalidate all cached properties."""
+ self._frame_indices = None
+ self._valid = None
+ self._action_pos_left = None
+ self._action_orixyzw_left = None
+ self._action_pos_right = None
+ self._action_orixyzw_right = None
+ self._action_gripper_left = None
+ self._action_gripper_right = None
+ self._gripper_width_left = None
+ self._gripper_width_right = None
+
+ @classmethod
+ def load(cls, path: str) -> 'TrainingDataSequence':
+ """Load a sequence from disk"""
+ data = np.load(path, allow_pickle=True)
+ sequence = cls()
+
+ sequence._frame_indices = data['frame_indices']
+ sequence._valid = data['valid']
+ sequence._action_pos_left = data['action_pos_left']
+ sequence._action_orixyzw_left = data['action_orixyzw_left']
+ sequence._action_pos_right = data['action_pos_right']
+ sequence._action_orixyzw_right = data['action_orixyzw_right']
+ sequence._action_gripper_left = data['action_gripper_left']
+ sequence._action_gripper_right = data['action_gripper_right']
+ sequence._gripper_width_left = data['gripper_width_left']
+ sequence._gripper_width_right = data['gripper_width_right']
+
+ return sequence
+
+@dataclass
+class HandFrame:
+ """Data structure for a single frame of hand data"""
+ frame_idx: int
+ hand_detected: bool
+ img_rgb: np.ndarray
+ img_hamer: np.ndarray
+ kpts_2d: np.ndarray # shape: (N, 2)
+ kpts_3d: np.ndarray # shape: (N, 3)
+
+ @classmethod
+ def create_empty_frame(cls, frame_idx: int, img_rgb: np.ndarray) -> 'HandFrame':
+ """Create a frame with no hand detection"""
+ return cls(
+ frame_idx=frame_idx,
+ hand_detected=False,
+ img_rgb=img_rgb,
+ img_hamer=np.zeros_like(img_rgb),
+ kpts_2d=np.zeros((21, 2)),
+ kpts_3d=np.zeros((21, 3)),
+ )
+
+class HandSequence(LazyLoadingMixin):
+ """Container for a sequence of hand data"""
+ def __init__(self):
+ self.frames: List[HandFrame] = []
+ self.metadata: Dict = {}
+
+ self._frame_indices: Optional[np.ndarray] = None
+ self._hand_detected: Optional[np.ndarray] = None
+ self._img_rgb: Optional[np.ndarray] = None
+ self._img_hamer: Optional[np.ndarray] = None
+ self._kpts_2d: Optional[np.ndarray] = None
+ self._kpts_3d: Optional[np.ndarray] = None
+
+ def add_frame(self, frame: HandFrame) -> None:
+ """Add a frame to the sequence and invalidate cached properties."""
+ self.frames.append(frame)
+ self._invalidate_cache()
+
+ def get_frame(self, frame_idx: int) -> HandFrame:
+ """Get a frame by index."""
+ return self.frames[frame_idx]
+
+ def modify_frame(self, frame_idx: int, frame: HandFrame) -> None:
+ """Modify a frame at the given index and invalidate cached properties."""
+ self.frames[frame_idx] = frame
+ self._invalidate_cache()
+
+ def save(self, path: str) -> None:
+ """Save the sequence to disk in both frame-wise and sequence-wise formats"""
+ sequence_data = {
+ 'hand_detected': self.hand_detected,
+ 'kpts_2d': self.kpts_2d,
+ 'kpts_3d': self.kpts_3d,
+ 'frame_indices': self.frame_indices,
+ }
+
+ np.savez_compressed(
+ path,
+ **sequence_data
+ )
+
+ @property
+ def frame_indices(self) -> np.ndarray:
+ """Lazy loading of all frame indices"""
+ return self._get_cached_property(
+ '_frame_indices',
+ lambda: np.arange(len(self.frames))
+ )
+
+ @property
+ def hand_detected(self) -> np.ndarray:
+ """Lazy loading of all hand detection flags"""
+ return self._get_cached_property(
+ '_hand_detected',
+ lambda: np.stack([f.hand_detected for f in self.frames])
+ )
+
+ @property
+ def imgs_rgb(self) -> np.ndarray:
+ """Lazy loading of all RGB images"""
+ return self._get_cached_property(
+ '_img_rgb',
+ lambda: np.stack([f.img_rgb for f in self.frames])
+ )
+
+ @property
+ def imgs_hamer(self) -> np.ndarray:
+ """Lazy loading of all HAMER images"""
+ return self._get_cached_property(
+ '_img_hamer',
+ lambda: np.stack([f.img_hamer for f in self.frames])
+ )
+
+ @property
+ def kpts_2d(self) -> np.ndarray:
+ """Lazy loading of all 2D keypoints"""
+ return self._get_cached_property(
+ '_kpts_2d',
+ lambda: np.stack([f.kpts_2d for f in self.frames])
+ )
+
+ @property
+ def kpts_3d(self) -> np.ndarray:
+ """Lazy loading of all 3D keypoints"""
+ return self._get_cached_property(
+ '_kpts_3d',
+ lambda: np.stack([f.kpts_3d for f in self.frames])
+ )
+
+ @classmethod
+ def load(cls, path: str) -> 'HandSequence':
+ """Load a sequence from disk"""
+ data = np.load(path, allow_pickle=True)
+ sequence = cls()
+
+ # Load pre-computed sequence-wise data
+ sequence._frame_indices = data['frame_indices']
+ sequence._hand_detected = data['hand_detected']
+ sequence._kpts_2d = data['kpts_2d']
+ sequence._kpts_3d = data['kpts_3d']
+
+ return sequence
+
+ def _invalidate_cache(self):
+ """Invalidate all cached properties."""
+ self._frame_indices = None
+ self._hand_detected = None
+ self._img_rgb = None
+ self._img_hamer = None
+ self._kpts_2d = None
+ self._kpts_3d = None
\ No newline at end of file
diff --git a/phantom/phantom/processors/robotinpaint_processor.py b/phantom/phantom/processors/robotinpaint_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd8b1e53191d19dde50c21f150e79d3358573dd1
--- /dev/null
+++ b/phantom/phantom/processors/robotinpaint_processor.py
@@ -0,0 +1,785 @@
+"""
+Robot Inpainting Processor Module
+
+This module uses MuJoCo to render robot models and overlay them onto human demonstration videos.
+
+Processing Pipeline:
+1. Load smoothed robot trajectories from previous processing stages
+2. Initialize MuJoCo robot simulation with calibrated camera parameters
+3. For each frame:
+ - Move simulated robot to target pose from human demonstration
+ - Render robot from calibrated camera viewpoint
+ - Apply depth-based occlusion handling (Optional)
+ - Create robot overlay on human demonstration video
+4. Generate training data with robot state annotations
+5. Save robot-inpainted videos and training data
+"""
+
+import os
+import pdb
+import numpy as np
+import cv2
+from tqdm import tqdm
+import mediapy as media
+from scipy.spatial.transform import Rotation
+from typing import Tuple, Dict, List, Optional, Any, Union
+import logging
+from dataclasses import dataclass
+
+from phantom.processors.phantom_data import TrainingData, TrainingDataSequence, HandSequence
+from phantom.processors.base_processor import BaseProcessor
+from phantom.twin_bimanual_robot import TwinBimanualRobot, MujocoCameraParams
+from phantom.twin_robot import TwinRobot
+from phantom.processors.paths import Paths
+
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class RobotState:
+ """
+ Container for robot state data including pose and gripper configuration.
+
+ Attributes:
+ pos: 3D position coordinates in world frame
+ ori_xyzw: Quaternion orientation in XYZW format (scalar-last)
+ gripper_pos: Gripper opening distance or action value
+ """
+ pos: np.ndarray
+ ori_xyzw: np.ndarray
+ gripper_pos: float
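+
+    # Illustrative construction (matching the XYZW, scalar-last convention above):
+    #   ori_xyzw = Rotation.from_matrix(np.eye(3)).as_quat()  # -> [0, 0, 0, 1]
+    #   state = RobotState(pos=np.zeros(3), ori_xyzw=ori_xyzw, gripper_pos=0.08)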
+
+class RobotInpaintProcessor(BaseProcessor):
+ """
+ Uses mujoco to overlay robot on human inpainted images.
+ """
+ # Processing constants for quality control and output formatting
+ TRACKING_ERROR_THRESHOLD = 0.05 # Maximum tracking error in meters
+ DEFAULT_FPS = 15 # Standard frame rate for output videos
+ DEFAULT_CODEC = "ffv1" # Lossless codec for high-quality output
+
+ def __init__(self, args: Any) -> None:
+ """
+ Initialize the robot inpainting processor with simulation parameters.
+
+ Args:
+ args: Command line arguments containing robot configuration,
+ camera parameters, and processing options
+ """
+ super().__init__(args)
+ self.use_depth = self.depth_for_overlay
+ self._initialize_robot()
+
+ def _initialize_robot(self) -> None:
+ """
+ Initialize the twin robot simulation with calibrated camera parameters.
+ """
+ # Generate MuJoCo camera parameters from real-world calibration
+ camera_params = self._get_mujoco_camera_params()
+ img_w, img_h = self._get_image_dimensions()
+
+ # Initialize appropriate robot configuration
+ if self.bimanual_setup == "single_arm":
+ self.twin_robot = TwinRobot(
+ self.robot,
+ self.gripper,
+ camera_params,
+ camera_height=img_h,
+ camera_width=img_w,
+ render=self.render,
+ n_steps_short=3,
+ n_steps_long=75,
+ debug_cameras=self.debug_cameras,
+ square=self.square,
+ )
+ else:
+ self.twin_robot = TwinBimanualRobot(
+ self.robot,
+ self.gripper,
+ self.bimanual_setup,
+ camera_params,
+ camera_height=img_h,
+ camera_width=img_w,
+ render=self.render,
+ n_steps_short=10,
+ n_steps_long=75,
+ debug_cameras=self.debug_cameras,
+ epic=self.epic,
+ joint_controller=False, # Use operational-space control
+ )
+
+ def __del__(self):
+ """Clean up robot simulation resources."""
+ if hasattr(self, 'twin_robot'):
+ self.twin_robot.close()
+
+ def process_one_demo(self, data_sub_folder: str) -> None:
+ """
+ Process a single demonstration to create robot-inpainted visualization.
+
+ Args:
+ data_sub_folder: Path to demonstration data folder containing
+ smoothed trajectories and original video data
+ """
+ save_folder = self.get_save_folder(data_sub_folder)
+ if self._should_skip_processing(save_folder):
+ return
+ paths = self.get_paths(save_folder)
+
+ # Reinitialize robot simulation for each demo to ensure clean state
+        self.twin_robot.close()
+ self._initialize_robot()
+
+ # Load and prepare demonstration data
+ data = self._load_data(paths)
+ images = self._load_images(paths, data["union_indices"])
+ gripper_actions, gripper_widths = self._process_gripper_widths(paths, data)
+
+ # Process all frames to generate robot overlays and training data
+ sequence, img_overlay, img_birdview = self._process_frames(images, data, gripper_actions, gripper_widths)
+
+ # Save comprehensive results
+ self._save_results(paths, sequence, img_overlay, img_birdview)
+
+ def _process_frames(self, images: Dict[str, np.ndarray], data: Dict[str, np.ndarray],
+ gripper_actions: Dict[str, np.ndarray], gripper_widths: Dict[str, np.ndarray]) -> Tuple[TrainingDataSequence, List[np.ndarray], Optional[List[np.ndarray]]]:
+ """
+ Process each frame to generate robot overlays and training data.
+
+ Args:
+ images: Dictionary containing human demonstration images and masks
+ data: Robot trajectory data (positions and orientations)
+ gripper_actions: Processed gripper action commands
+ gripper_widths: Gripper opening distances
+
+ Returns:
+ Tuple containing:
+ - TrainingDataSequence with robot state annotations
+ - List of robot overlay images
+ - Optional list of bird's eye view images (if debug cameras enabled)
+ """
+ sequence = TrainingDataSequence()
+ img_overlay = []
+ img_birdview = None
+ if "birdview" in self.debug_cameras:
+ img_birdview = []
+
+ for idx in tqdm(range(len(images['human_imgs'])), desc="Processing frames"):
+ # Extract robot states for current frame
+ left_state = self._get_robot_state(
+ data['ee_pts_left'][idx],
+ data['ee_oris_left'][idx],
+ gripper_widths['left'][idx]
+ )
+ right_state = self._get_robot_state(
+ data['ee_pts_right'][idx],
+ data['ee_oris_right'][idx],
+ gripper_widths['right'][idx]
+ )
+
+ # Process individual frame with robot simulation
+ frame_results = self._process_single_frame(
+ images, left_state, right_state, idx
+ )
+
+ # Handle failed processing (tracking errors, simulation issues)
+ if frame_results is None:
+ print(f"sdfsdfsTracking error too large at frame {idx}, skipping")
+ sequence.add_frame(TrainingData.create_empty_frame(
+ frame_idx=idx,
+ ))
+ img_overlay.append(np.zeros_like(images['human_imgs'][idx]))
+ if "birdview" in self.debug_cameras:
+ img_birdview.append(np.zeros_like(images['human_imgs'][idx]))
+ else:
+ # Create comprehensive training data annotation
+ sequence.add_frame(TrainingData(
+ frame_idx=idx,
+ valid=True,
+ action_pos_left=left_state.pos,
+ action_orixyzw_left=left_state.ori_xyzw,
+ action_pos_right=right_state.pos,
+ action_orixyzw_right=right_state.ori_xyzw,
+ action_gripper_left=gripper_actions['left'][idx],
+ action_gripper_right=gripper_actions['right'][idx],
+ gripper_width_left=gripper_widths['left'][idx],
+ gripper_width_right=gripper_widths['right'][idx],
+ ))
+ img_overlay.append(frame_results['rgb_robot_overlay'])
+ if "birdview" in self.debug_cameras:
+ img_birdview.append(frame_results['birdview_img'])
+ return sequence, img_overlay, img_birdview
+
+
+ def _process_single_frame(self, images: Dict[str, np.ndarray],
+ left_state: RobotState,
+ right_state: RobotState,
+ idx: int) -> Optional[Dict[str, np.ndarray]]:
+ """
+ Process a single frame to generate robot overlay and validate tracking.
+
+ Args:
+ images: Dictionary containing human images and segmentation data
+ left_state: Target state for left robot arm
+ right_state: Target state for right robot arm
+ idx: Frame index for initialization and logging
+
+ Returns:
+ Dictionary containing rendered robot overlay and debug camera views,
+ or None if tracking error exceeds threshold
+ """
+ # Prepare robot target state based on configuration
+ if self.bimanual_setup == "single_arm":
+ if self.target_hand == "left":
+ target_state = {
+ "pos": left_state.pos,
+ "ori_xyzw": left_state.ori_xyzw,
+ "gripper_pos": left_state.gripper_pos,
+ }
+ else:
+ target_state = {
+ "pos": right_state.pos,
+ "ori_xyzw": right_state.ori_xyzw,
+ "gripper_pos": right_state.gripper_pos,
+ }
+ else:
+ # Bimanual configuration requires coordinated control
+ target_state = {
+ "pos": [right_state.pos, left_state.pos],
+ "ori_xyzw": [right_state.ori_xyzw, left_state.ori_xyzw],
+ "gripper_pos": [right_state.gripper_pos, left_state.gripper_pos],
+ }
+
+ # Move robot to target state and get simulation results
+ robot_results = self.twin_robot.move_to_target_state(
+ target_state, init=(idx == 0) # Initialize on first frame
+ )
+
+ # Validate tracking accuracy to ensure quality
+ if self.bimanual_setup == "single_arm":
+ if robot_results['pos_err'] > self.TRACKING_ERROR_THRESHOLD:
+ print(f"Tracking error too large at frame {idx}, skipping", robot_results['pos_err'])
+ logger.warning(f"Tracking error too large at frame {idx}, skipping")
+ return None
+ else:
+ if robot_results['left_pos_err'] > self.TRACKING_ERROR_THRESHOLD or robot_results['right_pos_err'] > self.TRACKING_ERROR_THRESHOLD:
+ logger.warning(f"Tracking error too large at frame {idx}, skipping")
+ return None
+
+ # Generate robot overlay using appropriate method
+ if self.use_depth:
+ rgb_robot_overlay = self._process_robot_overlay_with_depth(
+ images['human_imgs'][idx],
+ images['human_masks'][idx],
+ images['imgs_depth'][idx],
+ robot_results
+ )
+ else:
+ rgb_robot_overlay = self._process_robot_overlay(
+ images['human_imgs'][idx], robot_results
+ )
+
+ # Prepare output with main overlay and debug camera views
+ output = {
+ 'rgb_robot_overlay': rgb_robot_overlay,
+ }
+
+ # Add debug camera views if requested
+ for cam in self.debug_cameras:
+ output[f"{cam}_img"] = (robot_results[f"{cam}_img"] * 255).astype(np.uint8)
+
+ return output
+
+ def _should_skip_processing(self, save_folder: str) -> bool:
+ """
+ Check if processing should be skipped due to existing output files.
+
+ Args:
+ save_folder: Directory where output files would be saved
+
+ Returns:
+ True if processing should be skipped, False otherwise
+ """
+ if self.skip_existing:
+ try:
+ with os.scandir(save_folder) as it:
+ existing_files = {entry.name for entry in it if entry.is_file()}
+ if str("video_overlay"+f"_{self.robot}_{self.bimanual_setup}.mkv") in existing_files:
+ print(f"Skipping existing demo {save_folder}")
+ return True
+ except FileNotFoundError:
+ return False
+ return False
+
+ def _load_data(self, paths: Paths) -> Dict[str, np.ndarray]:
+ """
+ Load robot trajectory data from smoothed action files.
+
+ Args:
+ paths: Paths object containing file locations
+
+ Returns:
+ Dictionary containing robot trajectory data and frame indices
+ """
+ if self.bimanual_setup == "single_arm":
+ # Get paths based on target hand for single-arm operation
+ smoothed_base = getattr(paths, f"smoothed_actions_{self.target_hand}")
+ actions_base = getattr(paths, f"actions_{self.target_hand}")
+ smoothed_actions_path = str(smoothed_base).replace(".npz", f"_{self.bimanual_setup}.npz")
+ actions_path = str(actions_base).replace(".npz", f"_{self.bimanual_setup}.npz")
+
+ # Load actual trajectory data for target hand
+ ee_pts = np.load(smoothed_actions_path)["ee_pts"]
+ ee_oris = np.load(smoothed_actions_path)["ee_oris"]
+
+ # Create dummy data for non-target hand
+ dummy_pts = np.zeros((len(ee_pts), 3))
+ dummy_oris = np.eye(3)[None, :, :].repeat(len(ee_oris), axis=0)
+
+ # Create data dictionary with target hand data and dummy data for other hand
+ other_hand = "right" if self.target_hand == "left" else "left"
+ return {
+ f'ee_pts_{self.target_hand}': ee_pts,
+ f'ee_oris_{self.target_hand}': ee_oris,
+ f'ee_pts_{other_hand}': dummy_pts,
+ f'ee_oris_{other_hand}': dummy_oris,
+ 'union_indices': np.load(actions_path, allow_pickle=True)["union_indices"]
+ }
+
+ # Load bimanual trajectory data
+ smoothed_actions_left_path = str(paths.smoothed_actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ smoothed_actions_right_path = str(paths.smoothed_actions_right).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ actions_left_path = str(paths.actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ return {
+ 'ee_pts_left': np.load(smoothed_actions_left_path)["ee_pts"],
+ 'ee_oris_left': np.load(smoothed_actions_left_path)["ee_oris"],
+ 'ee_pts_right': np.load(smoothed_actions_right_path)["ee_pts"],
+ 'ee_oris_right': np.load(smoothed_actions_right_path)["ee_oris"],
+ 'union_indices': np.load(actions_left_path, allow_pickle=True)["union_indices"]
+ }
+
+ def _load_images(self, paths: Paths, union_indices: np.ndarray) -> Dict[str, np.ndarray]:
+ """
+ Load and index human demonstration images and associated data.
+
+ Args:
+ paths: Paths object containing image file locations
+ union_indices: Frame indices to extract from full video sequences
+
+ Returns:
+ Dictionary containing indexed human images, masks, and depth data
+ """
+ return {
+ 'human_masks': np.load(paths.masks_arm)[union_indices],
+ 'human_imgs': np.array(media.read_video(paths.video_human_inpaint))[union_indices],
+ 'imgs_depth': np.load(paths.depth)[union_indices] if self.use_depth else None
+ }
+
+ def _process_gripper_widths(self, paths: Paths, data: Dict[str, np.ndarray]) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
+ """
+ Process gripper distance data into robot action commands.
+
+ Args:
+ paths: Paths object containing smoothed action file locations
+ data: Dictionary containing trajectory data and frame indices
+
+ Returns:
+ Tuple containing:
+ - Dictionary of gripper action commands for each hand
+ - Dictionary of gripper width values for each hand
+ """
+ if self.bimanual_setup == "single_arm":
+ # Get the appropriate smoothed actions path based on target hand
+ base_path = getattr(paths, f"smoothed_actions_{self.target_hand}")
+ smoothed_actions_path = str(base_path).replace(".npz", f"_{self.bimanual_setup}.npz")
+
+ # Compute gripper actions and widths from smoothed data
+ actions, widths = self._compute_gripper_actions(
+ np.load(smoothed_actions_path)["ee_widths"]
+ )
+
+ # Create return dictionaries with actions for target hand, zeros for the other
+ num_indices = len(data['union_indices'])
+ other_hand = "right" if self.target_hand == "left" else "left"
+
+ return (
+ {self.target_hand: actions, other_hand: np.zeros(num_indices)},
+ {self.target_hand: widths, other_hand: np.zeros(num_indices)}
+ )
+
+ # Process bimanual gripper data
+ smoothed_actions_left_path = str(paths.smoothed_actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ smoothed_actions_right_path = str(paths.smoothed_actions_right).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ left_actions, left_widths = self._compute_gripper_actions(
+ np.load(smoothed_actions_left_path)["ee_widths"]
+ )
+ right_actions, right_widths = self._compute_gripper_actions(
+ np.load(smoothed_actions_right_path)["ee_widths"]
+ )
+ return {'left': left_actions, 'right': right_actions}, {'left': left_widths, 'right': right_widths}
+
+
+ def _compute_gripper_actions(self, list_gripper_dist: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Convert continuous gripper distances to discrete robot gripper actions.
+ Args:
+ list_gripper_dist: Array of gripper distances throughout trajectory
+
+ Returns:
+ Tuple containing:
+ - Gripper action commands (0 for grasp, distance for open)
+ - Processed gripper width values
+ """
+ try:
+ # Analyze gripper distance range and determine grasp threshold
+ min_val, max_val = np.min(list_gripper_dist), np.max(list_gripper_dist)
+            thresh = min_val + 0.2 * (max_val - min_val)  # threshold at 20% of the range above the minimum
+
+ # Classify gripper states: 0 = closed/grasping, 1 = open
+ gripper_state = np.array([0 if dist < thresh else 1 for dist in list_gripper_dist])
+
+ # Find range of grasping action
+ min_idx_pos = np.where(gripper_state == 0)[0][0]
+ max_idx_pos = np.where(gripper_state == 0)[0][-1]
+
+ # Generate gripper action commands
+ list_gripper_actions = []
+ for idx in range(len(list_gripper_dist)):
+ if min_idx_pos <= idx <= max_idx_pos:
+ # During grasping phase: use grasp command (0) and limit distance
+ list_gripper_actions.append(0)
+ list_gripper_dist[idx] = np.min([list_gripper_dist[idx], thresh])
+ else:
+ # Outside grasping phase: use distance as action command
+ list_gripper_actions.append(list_gripper_dist[idx])
+        except Exception:
+ # Fallback: use distances directly if processing fails
+ list_gripper_actions = list_gripper_dist.tolist()
+
+ return np.array(list_gripper_actions), list_gripper_dist
+
+ def _get_robot_state(self, ee_pt: np.ndarray, ori_matrix: np.ndarray, gripper_dist: float) -> RobotState:
+ """
+ Convert trajectory data to robot state representation.
+
+ Args:
+ ee_pt: End-effector position in 3D space
+ ori_matrix: 3x3 rotation matrix for end-effector orientation
+ gripper_dist: Gripper opening distance
+
+ Returns:
+ RobotState object containing pose and gripper information
+ """
+ # Convert rotation matrix to quaternion (XYZW format for robot control)
+ ori_xyzw = Rotation.from_matrix(ori_matrix).as_quat(scalar_first=False)
+ robot_state = RobotState(pos=ee_pt, ori_xyzw=ori_xyzw, gripper_pos=gripper_dist)
+ return robot_state
+
+ def _process_robot_overlay(self, img: np.ndarray, robot_results: Dict[str, Any]) -> np.ndarray:
+ """
+ Create robot overlay on human image using segmentation masks.
+
+ Args:
+ img: Original human demonstration image
+ robot_results: Dictionary containing robot rendering results
+
+ Returns:
+ Image with robot overlay applied
+ """
+ # Extract robot rendering and segmentation data
+ rgb_img_sim = (robot_results['rgb_img'] * 255).astype(np.uint8)
+ H, W = rgb_img_sim.shape[:2]
+
+ # Resize robot rendering and masks to match output resolution
+ if self.square:
+ rgb_img_sim = cv2.resize(rgb_img_sim, (self.output_resolution, self.output_resolution))
+ robot_mask = cv2.resize(robot_results['robot_mask'], (self.output_resolution, self.output_resolution))
+ robot_mask[robot_mask > 0] = 1
+ gripper_mask = cv2.resize(robot_results['gripper_mask'], (self.output_resolution, self.output_resolution))
+ gripper_mask[gripper_mask > 0] = 1
+ else:
+ rgb_img_sim = cv2.resize(rgb_img_sim, (int(W/H*self.output_resolution), self.output_resolution))
+ robot_mask = cv2.resize(robot_results['robot_mask'], (int(W/H*self.output_resolution), self.output_resolution))
+ robot_mask[robot_mask > 0] = 1
+ gripper_mask = cv2.resize(robot_results['gripper_mask'], (int(W/H*self.output_resolution), self.output_resolution))
+ gripper_mask[gripper_mask > 0] = 1
+
+ # Create overlay by compositing robot over human image
+ img_robot_overlay = img.copy()
+ overlay_mask = (robot_mask == 1) | (gripper_mask == 1)
+ img_robot_overlay[overlay_mask] = rgb_img_sim[overlay_mask]
+
+ return img_robot_overlay
+
+ def _process_robot_overlay_with_depth(self, img: np.ndarray, hand_mask: np.ndarray,
+ img_depth: np.ndarray, robot_results: Dict[str, Any]) -> np.ndarray:
+ """
+ Create depth-aware robot overlay with realistic occlusion handling.
+
+ Args:
+ img: Original human demonstration image
+ hand_mask: Segmentation mask of human hand regions
+ img_depth: Depth image corresponding to the demonstration
+ robot_results: Dictionary containing robot rendering and depth results
+
+ Returns:
+ Image with depth-aware robot overlay applied
+ """
+ # Extract robot rendering and depth data
+ robot_mask = robot_results['robot_mask']
+ gripper_mask = robot_results['gripper_mask']
+ rgb_img_sim = robot_results['rgb_img']
+ depth_img_sim = np.squeeze(robot_results['depth_img'])
+ H, W = rgb_img_sim.shape[:2]
+
+ # Create masked depth images for occlusion analysis
+ depth_sim_masked = self._create_masked_depth(depth_img_sim, robot_mask, gripper_mask)
+ depth_masked = self._create_masked_depth(img_depth, robot_mask, gripper_mask)
+
+ # Process hand mask for improved occlusion handling
+ hand_mask = self._dilate_mask(hand_mask.astype(np.uint8))
+
+ # Create overlay mask using depth-based occlusion
+ img_robot_overlay = img.copy()
+ overlay_mask = self._create_overlay_mask(
+ robot_mask, gripper_mask, depth_masked, depth_sim_masked, hand_mask
+ )
+
+ # Convert and resize robot rendering
+ rgb_img_sim = (rgb_img_sim * 255).astype(np.uint8)
+
+ if self.square:
+ resize_shape = (self.output_resolution, self.output_resolution)
+ else:
+ resize_shape = (int(W/H*self.output_resolution), self.output_resolution)
+
+ # Apply final overlay with depth-aware occlusion
+ rgb_img_sim = cv2.resize(rgb_img_sim, resize_shape)
+ overlay_mask = cv2.resize(overlay_mask.astype(np.uint8), resize_shape)
+ overlay_mask[overlay_mask > 0] = 1
+ overlay_mask = overlay_mask.astype(bool)
+
+ img_robot_overlay[overlay_mask] = rgb_img_sim[overlay_mask]
+
+ return img_robot_overlay
+
+ def _create_masked_depth(self, depth_img: np.ndarray, robot_mask: np.ndarray,
+ gripper_mask: np.ndarray) -> np.ndarray:
+ """
+ Create depth image masked to robot regions for occlusion analysis.
+
+ Args:
+ depth_img: Input depth image
+ robot_mask: Binary mask indicating robot regions
+ gripper_mask: Binary mask indicating gripper regions
+
+ Returns:
+ Depth image with values only in robot/gripper regions
+ """
+ masked_img = np.zeros_like(depth_img)
+ mask = (robot_mask == 1) | (gripper_mask == 1)
+ masked_img[mask] = depth_img[mask]
+ return masked_img
+
+ def _dilate_mask(self, mask: np.ndarray) -> np.ndarray:
+ """
+ Apply morphological dilation to expand mask boundaries.
+
+ Args:
+ mask: Binary mask to dilate
+
+ Returns:
+ Dilated binary mask
+ """
+ kernel = np.ones((5, 5), np.uint8)
+ return cv2.dilate(mask, kernel, iterations=1)
+
+ def _create_overlay_mask(self, robot_mask: np.ndarray, gripper_mask: np.ndarray,
+ depth_masked: np.ndarray, depth_sim_masked: np.ndarray,
+ hand_mask: np.ndarray) -> np.ndarray:
+ """
+ Create sophisticated overlay mask using depth-based occlusion reasoning.
+
+ Args:
+ robot_mask: Binary mask for robot body regions
+ gripper_mask: Binary mask for robot gripper regions
+ depth_masked: Real depth image masked to robot regions
+ depth_sim_masked: Simulated robot depth masked to robot regions
+ hand_mask: Binary mask for human hand regions
+
+ Returns:
+ Binary mask indicating where robot overlay should be applied
+ """
+ # Start with basic robot visibility mask
+ overlay_mask = (robot_mask == 1) | (gripper_mask == 1)
+
+ # Apply depth-based occlusion: hide robot when it's behind real objects
+ # and not in hand regions (where occlusion handling is more complex)
+ overlay_mask[(depth_masked < depth_sim_masked) & (hand_mask == 0)] = 0
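+        # Illustrative case: if the measured depth at a robot pixel is 0.4 m while the
+        # rendered robot sits at 0.6 m, a real object is in front of the robot, so the
+        # pixel is dropped from the overlay unless it falls inside the dilated hand mask.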
+
+ return overlay_mask
+
+ def _save_results(self, paths: Paths, sequence: TrainingDataSequence, img_overlay: List[np.ndarray],
+ img_birdview: Optional[List[np.ndarray]] = None) -> None:
+ """
+ Save comprehensive robot inpainting results to disk.
+
+ Args:
+ paths: Paths object containing output file locations
+ sequence: Training data sequence with robot state annotations
+ img_overlay: List of robot overlay images
+ img_birdview: Optional list of bird's eye view images for analysis
+ """
+ # Create output directory
+ os.makedirs(paths.inpaint_processor, exist_ok=True)
+
+ if len(img_overlay) == 0:
+ print("No robot inpainted images, skipping")
+ return
+
+ # Save main robot-inpainted video
+ video_path = str(paths.video_overlay).split(".mkv")[0] + f"_{self.robot}_{self.bimanual_setup}.mkv"
+ self._save_video(video_path, img_overlay)
+
+ # Save bird's eye view video for analysis and debugging
+ if img_birdview is not None:
+ birdview_path = str(paths.video_birdview).split(".mkv")[0] + f"_{self.robot}_{self.bimanual_setup}.mkv"
+ self._save_video(birdview_path, np.array(img_birdview))
+
+ # Save comprehensive training data with robot state annotations
+ training_data_path = str(paths.training_data).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ sequence.save(training_data_path)
+
+ def _save_video(self, path: str, frames: List[np.ndarray]) -> None:
+ """
+ Save video with consistent encoding parameters.
+
+ Args:
+ path: Output video file path
+ frames: List of video frames to save
+ """
+ media.write_video(
+ path,
+ frames,
+ fps=self.DEFAULT_FPS,
+ codec=self.DEFAULT_CODEC
+ )
+
+ def _get_mujoco_camera_params(self) -> MujocoCameraParams:
+ """
+ Generate MuJoCo camera parameters from real-world camera calibration.
+
+ Returns:
+ MujocoCameraParams object with calibrated camera settings
+ """
+ # Extract real-world camera extrinsics and convert to MuJoCo format
+ extrinsics = self.extrinsics[0]
+ camera_ori_wxyz = self._convert_real_camera_ori_to_mujoco(
+ np.array(extrinsics["camera_base_ori"])
+ )
+
+ # Calculate image dimensions and camera intrinsics
+ img_w, img_h = self._get_image_dimensions()
+ offset = self._calculate_image_offset(img_w, img_h)
+ fx, fy, cx, cy = self._get_camera_intrinsics(offset)
+ sensor_width, sensor_height = self._calculate_sensor_size(img_w, img_h, fx, fy)
+
+ # Select appropriate camera name based on dataset
+ if self.epic:
+ camera_name = "zed"
+ else:
+ camera_name = "frontview"
+
+ return MujocoCameraParams(
+ name=camera_name,
+ pos=extrinsics["camera_base_pos"],
+ ori_wxyz=camera_ori_wxyz,
+ fov=self.intrinsics_dict["v_fov"],
+ resolution=(img_h, img_w),
+ sensorsize=np.array([sensor_width, sensor_height]),
+ principalpixel=np.array([img_w/2-cx, cy-img_h/2]),
+ focalpixel=np.array([fx, fy])
+ )
+
+ def _get_image_dimensions(self) -> Tuple[int, int]:
+ """
+ Calculate image dimensions based on input resolution configuration.
+
+ Returns:
+ Tuple of (width, height) in pixels
+ """
+        # Epic dataset input: 456x256 frames
+        if self.input_resolution == 256:
+            img_w = 456
+        # Phantom paper setting: 16:9 frames
+        elif self.input_resolution == 1080:
+            img_w = self.input_resolution * 16 // 9
+        else:
+            raise ValueError(f"Unsupported input resolution: {self.input_resolution}")
+        img_h = self.input_resolution
+        return img_w, img_h
+
+ def _calculate_image_offset(self, img_w: int, img_h: int) -> int:
+ """
+ Calculate horizontal image offset for square aspect ratio processing.
+
+ Args:
+ img_w: Image width in pixels
+ img_h: Image height in pixels
+
+ Returns:
+ Horizontal offset in pixels
+ """
+ if self.square:
+ offset = (img_w - img_h) // 2
+ else:
+ offset = 0
+ return offset
+
+ def _get_camera_intrinsics(self, offset: int) -> Tuple[float, float, float, float]:
+ """
+ Extract camera intrinsic parameters with offset correction.
+
+ Args:
+ offset: Horizontal offset for principal point adjustment
+
+ Returns:
+ Tuple of (fx, fy, cx, cy) camera intrinsic parameters
+ """
+ return self.intrinsics_dict["fx"], self.intrinsics_dict["fy"], self.intrinsics_dict["cx"]+offset, self.intrinsics_dict["cy"]
+
+ def _calculate_sensor_size(self, img_w: int, img_h: int, fx: float, fy: float) -> Tuple[float, float]:
+ """
+ Calculate physical sensor dimensions from image resolution and focal length.
+
+ Args:
+ img_w: Image width in pixels
+ img_h: Image height in pixels
+ fx: Focal length in x direction (pixels)
+ fy: Focal length in y direction (pixels)
+
+ Returns:
+ Tuple of (sensor_width, sensor_height) in meters
+ """
+ sensor_width = img_w / fy / 1000
+ sensor_height = img_h / fx / 1000
+ return sensor_width, sensor_height
+
+ @staticmethod
+ def _convert_real_camera_ori_to_mujoco(camera_ori_matrix: np.ndarray) -> np.ndarray:
+ """
+ Convert real-world camera orientation to MuJoCo coordinate system.
+
+ Args:
+ camera_ori_matrix: 3x3 rotation matrix in real-world coordinates
+
+ Returns:
+ Quaternion in WXYZ format for MuJoCo
+ """
+ # Apply coordinate system transformation (flip Y and Z axes)
+ camera_ori_matrix[:, [1, 2]] = -camera_ori_matrix[:, [1, 2]]
+
+ # Convert to quaternion in MuJoCo's WXYZ format
+ r = Rotation.from_matrix(camera_ori_matrix)
+ camera_ori_wxyz = r.as_quat(scalar_first=True)
+ return camera_ori_wxyz
+
+
diff --git a/phantom/phantom/processors/segmentation_processor.py b/phantom/phantom/processors/segmentation_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9adc48f70e4509f9e714f42be59b442405ecd33b
--- /dev/null
+++ b/phantom/phantom/processors/segmentation_processor.py
@@ -0,0 +1,1056 @@
+"""
+Segmentation Processor Module
+
+This module uses SAM2 to create masks of hands and arms in video sequences.
+
+Processing Pipeline:
+1. Load video frames and detection/pose data from previous stages
+2. Initialize segmentation with highest-quality detection frame
+3. Propagate segmentation bidirectionally (forward and reverse)
+4. Combine temporal results for complete sequence coverage
+5. Generate visualization videos and save segmentation masks
+
+The module supports different segmentation modes:
+- HandSegmentationProcessor: Precise hand-only segmentation
+- ArmSegmentationProcessor: Combined hand + arm segmentation
+"""
+
+import os
+import logging
+import shutil
+from tqdm import tqdm
+import numpy as np
+import mediapy as media
+import argparse
+from typing import Dict, Tuple, Optional, List
+
+from phantom.processors.paths import Paths
+from phantom.processors.base_processor import BaseProcessor
+from phantom.detectors.detector_sam2 import DetectorSam2
+from phantom.detectors.detector_detectron2 import DetectorDetectron2
+from phantom.utils.bbox_utils import get_overlap_score
+from phantom.processors.phantom_data import HandSequence
+
+logger = logging.getLogger(__name__)
+
+# Configuration constants for segmentation processing
+DEFAULT_FPS = 10
+DEFAULT_OVERLAP_THRESHOLD = 0.5
+DEFAULT_CODEC = "ffv1"
+ANNOTATION_CODEC = "h264"
+
+class BaseSegmentationProcessor(BaseProcessor):
+ """
+ Base class for video segmentation processing using SAM2.
+
+ The base processor establishes the framework for temporal segmentation processing,
+ where segmentation masks are propagated both forward and backward through time
+ to ensure temporal consistency and complete coverage of the video sequence.
+
+ Attributes:
+ detector_sam (DetectorSam2): SAM2 segmentation model instance
+ """
+ def __init__(self, args: argparse.Namespace) -> None:
+ """
+ Initialize the base segmentation processor.
+
+ Args:
+ args: Command line arguments containing segmentation configuration
+ """
+ super().__init__(args)
+ self.detector_sam = DetectorSam2()
+
+ def process_one_demo(self, data_sub_folder: str) -> None:
+ """
+ Process a single demonstration - to be implemented by subclasses.
+
+ Args:
+ data_sub_folder: Path to demonstration data folder
+
+ Raises:
+ NotImplementedError: Must be implemented by concrete subclasses
+ """
+ raise NotImplementedError("Subclasses must implement this method")
+
+ def _load_hamer_data(self, paths: Paths) -> Dict[str, HandSequence]:
+ """
+ Load hand pose estimation data from previous processing stage.
+
+ Args:
+ paths: Paths object containing file locations
+
+ Returns:
+ Dictionary containing left and right hand sequences
+ """
+ if self.bimanual_setup == "single_arm":
+ if self.target_hand == "left":
+ return {"left": HandSequence.load(paths.hand_data_left)}
+ elif self.target_hand == "right":
+ return {"right": HandSequence.load(paths.hand_data_right)}
+ else:
+ raise ValueError(f"Invalid target hand: {self.target_hand}")
+ elif self.bimanual_setup == "shoulders":
+ return {
+ "left": HandSequence.load(paths.hand_data_left),
+ "right": HandSequence.load(paths.hand_data_right)
+ }
+ else:
+ raise ValueError(f"Invalid bimanual setup: {self.bimanual_setup}")
+
+ @staticmethod
+ def _load_video(video_path: str) -> np.ndarray:
+ """
+ Load and validate video frames from disk.
+
+ Args:
+ video_path: Path to video file
+
+ Returns:
+ Array of RGB video frames
+
+ Raises:
+ FileNotFoundError: If video file doesn't exist
+ ValueError: If video file is empty or corrupted
+ """
+ if not os.path.exists(video_path):
+ raise FileNotFoundError(f"Video file not found: {video_path}")
+
+ imgs_rgb = media.read_video(video_path)
+ if len(imgs_rgb) == 0:
+ raise ValueError("Empty video file")
+
+ return imgs_rgb
+
+ @staticmethod
+ def _load_bbox_data(bbox_path: str) -> Dict[str, np.ndarray]:
+ """
+ Load and validate bounding box detection data.
+
+ Args:
+ bbox_path: Path to bounding box data file
+
+ Returns:
+ Dictionary containing detection results from bounding box processor
+
+ Raises:
+ FileNotFoundError: If bounding box data file doesn't exist
+ """
+ if not os.path.exists(bbox_path):
+ raise FileNotFoundError(f"Bbox data not found: {bbox_path}")
+
+ return np.load(bbox_path)
+
+ @staticmethod
+ def _combine_sam_images(
+ imgs_rgb: np.ndarray,
+ imgs_forward: Dict[int, np.ndarray],
+ imgs_reverse: Dict[int, np.ndarray]
+ ) -> np.ndarray:
+ """
+ Combine forward and reverse SAM visualization images.
+
+ This method merges the visualization results from bidirectional
+ processing to create a complete visualization sequence.
+
+ Args:
+ imgs_rgb: Original RGB frames for shape reference
+ imgs_forward: Forward propagation visualization results
+ imgs_reverse: Reverse propagation visualization results
+
+ Returns:
+ Combined visualization array
+ """
+ result = np.zeros_like(imgs_rgb)
+ # Fill in forward propagation results
+ for idx in imgs_forward:
+ result[idx] = imgs_forward[idx]
+ # Fill in reverse propagation results (may overwrite forward results)
+ for idx in imgs_reverse:
+ result[idx] = imgs_reverse[idx]
+ return result
+
+ @staticmethod
+ def _combine_masks(
+ imgs_rgb: np.ndarray,
+ masks_forward: Dict[int, np.ndarray],
+ masks_reverse: Dict[int, np.ndarray]
+ ) -> np.ndarray:
+ """
+ Combine forward and reverse segmentation masks.
+
+ This method merges segmentation masks from bidirectional processing
+ to ensure complete temporal coverage of the video sequence.
+
+ Args:
+ imgs_rgb: Original RGB frames for shape reference
+ masks_forward: Forward propagation mask results
+ masks_reverse: Reverse propagation mask results
+
+ Returns:
+ Combined mask array with shape (num_frames, height, width)
+ """
+ result = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
+ for idx in masks_forward:
+ result[idx] = masks_forward[idx][0]
+ for idx in masks_reverse:
+ result[idx] = masks_reverse[idx][0]
+ return result
+
+class ArmSegmentationProcessor(BaseSegmentationProcessor):
+ """
+ Processor for segmenting combined hand and arm regions in video sequences.
+
+ Attributes:
+ detectron_detector (DetectorDetectron2): Detectron2 model for initial detection
+ """
+ def __init__(self, args: argparse.Namespace) -> None:
+ """
+ Initialize the arm segmentation processor with detection models.
+
+ Args:
+ args: Command line arguments containing model configuration
+ """
+ super().__init__(args)
+
+ # Initialize Detectron2 for initial hand/arm detection
+ root_dir = "../submodules/phantom-hamer/"
+ self.detectron_detector = DetectorDetectron2(root_dir)
+
+
+ def process_one_demo(self, data_sub_folder: str, hamer_data: Optional[Dict[str, HandSequence]] = None) -> None:
+ """
+ Process a single video demonstration to generate combined hand + arm segmentation masks.
+
+ Args:
+ data_sub_folder: Path to the subfolder containing the demo data
+ hamer_data: Optional pre-loaded hand pose data for segmentation guidance
+
+ Raises:
+ FileNotFoundError: If required input files are not found
+ ValueError: If video frames or bounding boxes are invalid
+ """
+ # Setup and load all required data
+ save_folder, paths, imgs_rgb, bbox_data, det_bbox_data, hamer_data = self._setup_processing(
+ data_sub_folder, hamer_data
+ )
+
+ # Process based on setup type
+ if self.bimanual_setup == "single_arm":
+ masks = self._process_single_arm(imgs_rgb, bbox_data, det_bbox_data, hamer_data, paths)
+ elif self.bimanual_setup == "shoulders":
+ masks = self._process_bimanual(imgs_rgb, bbox_data, det_bbox_data, hamer_data, paths)
+ else:
+ raise ValueError(f"Invalid bimanual setup: {self.bimanual_setup}")
+
+ # Create visualization and save results
+ sam_imgs = self._create_visualization(imgs_rgb, masks)
+ self._validate_output_consistency(imgs_rgb, masks, sam_imgs)
+ self._save_results(paths, masks, sam_imgs)
+
+ def _setup_processing(
+ self,
+ data_sub_folder: str,
+ hamer_data: Optional[Dict[str, HandSequence]]
+ ) -> Tuple[str, Paths, np.ndarray, Dict[str, np.ndarray], Dict[str, np.ndarray], Dict[str, HandSequence]]:
+ """
+ Setup processing environment and load all required data.
+
+ Args:
+ data_sub_folder: Path to the subfolder containing the demo data
+ hamer_data: Optional pre-loaded hand pose data
+
+ Returns:
+ Tuple containing: (save_folder, paths, imgs_rgb, bbox_data, det_bbox_data, hamer_data)
+ """
+ save_folder = self.get_save_folder(data_sub_folder)
+ paths = self.get_paths(save_folder)
+ paths._setup_original_images()
+ paths._setup_original_images_reverse()
+
+ # Load and validate all input data
+ imgs_rgb = self._load_video(paths.video_left)
+ bbox_data = self._load_bbox_data(paths.bbox_data)
+ det_bbox_data = self.get_detectron_bboxes(imgs_rgb, bbox_data)
+ if hamer_data is None:
+ hamer_data = self._load_hamer_data(paths)
+
+ return save_folder, paths, imgs_rgb, bbox_data, det_bbox_data, hamer_data
+
+ def _process_single_arm(
+ self,
+ imgs_rgb: np.ndarray,
+ bbox_data: Dict[str, np.ndarray],
+ det_bbox_data: Dict[str, np.ndarray],
+ hamer_data: Dict[str, HandSequence],
+ paths: Paths
+ ) -> np.ndarray:
+ """
+ Process single arm setup (left or right hand only).
+
+ Args:
+ imgs_rgb: RGB video frames
+ bbox_data: Bounding box detection data
+ det_bbox_data: Detectron2 refined bounding boxes
+ hamer_data: Hand pose estimation data
+ paths: Paths object for file management
+
+ Returns:
+ Boolean segmentation masks
+ """
+ if self.target_hand == "left":
+ hand_data = self._process_hand_data(
+ imgs_rgb,
+ bbox_data["left_bboxes"],
+ bbox_data["left_bbox_min_dist_to_edge"],
+ bbox_data["left_hand_detected"],
+ det_bbox_data["left_det_bboxes"],
+ hamer_data["left"],
+ paths,
+ "left"
+ )
+ masks = hand_data["left_masks"].astype(np.bool_)
+ elif self.target_hand == "right":
+ hand_data = self._process_hand_data(
+ imgs_rgb,
+ bbox_data["right_bboxes"],
+ bbox_data["right_bbox_min_dist_to_edge"],
+ bbox_data["right_hand_detected"],
+ det_bbox_data["right_det_bboxes"],
+ hamer_data["right"],
+ paths,
+ "right"
+ )
+ masks = hand_data["right_masks"].astype(np.bool_)
+ else:
+ raise ValueError(f"Invalid target hand: {self.target_hand}")
+
+ return masks.astype(np.bool_)
+
+ def _process_bimanual(
+ self,
+ imgs_rgb: np.ndarray,
+ bbox_data: Dict[str, np.ndarray],
+ det_bbox_data: Dict[str, np.ndarray],
+ hamer_data: Dict[str, HandSequence],
+ paths: Paths
+ ) -> np.ndarray:
+ """
+ Process bimanual setup (both hands combined).
+
+ Args:
+ imgs_rgb: RGB video frames
+ bbox_data: Bounding box detection data
+ det_bbox_data: Detectron2 refined bounding boxes
+ hamer_data: Hand pose estimation data
+ paths: Paths object for file management
+
+ Returns:
+ Combined boolean segmentation masks
+ """
+ # Process left hand with arm segmentation
+ left_data = self._process_hand_data(
+ imgs_rgb,
+ bbox_data["left_bboxes"],
+ bbox_data["left_bbox_min_dist_to_edge"],
+ bbox_data["left_hand_detected"],
+ det_bbox_data["left_det_bboxes"],
+ hamer_data["left"],
+ paths,
+ "left"
+ )
+
+ # Process right hand with arm segmentation
+ right_data = self._process_hand_data(
+ imgs_rgb,
+ bbox_data["right_bboxes"],
+ bbox_data["right_bbox_min_dist_to_edge"],
+ bbox_data["right_hand_detected"],
+ det_bbox_data["right_det_bboxes"],
+ hamer_data["right"],
+ paths,
+ "right"
+ )
+
+ # Convert to boolean masks and combine
+ left_masks = left_data["left_masks"].astype(np.bool_)
+ right_masks = right_data["right_masks"].astype(np.bool_)
+
+ # Generate combined video masks by taking the union of left and right masks
+ masks = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
+ for idx in range(len(imgs_rgb)):
+ masks[idx] = left_masks[idx] | right_masks[idx]
+
+ return masks.astype(np.bool_)
+
+ def _create_visualization(self, imgs_rgb: np.ndarray, masks: np.ndarray) -> np.ndarray:
+ """
+ Create visualization by masking out segmented regions.
+
+ Args:
+ imgs_rgb: Original RGB video frames
+ masks: Boolean segmentation masks
+
+ Returns:
+ Visualization images with masked regions set to black
+ """
+ sam_imgs = []
+ for idx in range(len(imgs_rgb)):
+ img = imgs_rgb[idx].copy() # Create copy to avoid modifying original
+ mask = masks[idx]
+ img[mask] = 0 # Set masked regions to black
+ sam_imgs.append(img)
+ return np.array(sam_imgs)
+
+ def _validate_output_consistency(
+ self,
+ imgs_rgb: np.ndarray,
+ masks: np.ndarray,
+ sam_imgs: np.ndarray
+ ) -> None:
+ """
+ Validate that output arrays have consistent dimensions.
+
+ Args:
+ imgs_rgb: Original RGB video frames
+ masks: Segmentation masks
+ sam_imgs: Visualization images
+
+ Raises:
+ AssertionError: If dimensions don't match
+ """
+ assert len(sam_imgs) == len(imgs_rgb), "Visualization length doesn't match input"
+ assert len(masks) == len(imgs_rgb), "Masks length doesn't match input"
+
+
+ def _process_hand_data(
+ self,
+ imgs_rgb: np.ndarray,
+ bboxes: np.ndarray,
+ bbox_min_dist: np.ndarray,
+ hand_detected: np.ndarray,
+ det_bboxes: np.ndarray,
+ hamer_data: HandSequence,
+ paths: Paths,
+ hand_side: str
+ ) -> Dict[str, np.ndarray]:
+ """
+ Process segmentation data for a single hand (left or right) with arm inclusion.
+
+ Args:
+ imgs_rgb: RGB video frames
+ bboxes: Hand bounding boxes from detection stage
+ bbox_min_dist: Minimum distances to image edges (quality metric)
+ hand_detected: Boolean flags indicating valid hand detections
+ det_bboxes: Refined bounding boxes from Detectron2
+ hamer_data: Hand pose data for segmentation guidance
+ paths: Paths object for file management
+ hand_side: "left" or "right" specifying which hand to process
+
+ Returns:
+ Dictionary containing segmentation masks and visualization images
+ """
+ # Handle cases with no valid detections
+ if not hand_detected.any() or max(bbox_min_dist) == 0:
+ return {
+ f"{hand_side}_masks": np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1])),
+ f"{hand_side}_sam_imgs": np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1], 3))
+ }
+
+ # Extract hand pose keypoints for segmentation guidance
+ kpts_2d = hamer_data.kpts_2d
+
+ # Find the frame with highest quality (furthest from edges)
+ max_dist_idx = np.argmax(bbox_min_dist)
+ points = np.expand_dims(kpts_2d[max_dist_idx], axis=1)
+ bbox_dets = det_bboxes[max_dist_idx]
+
+ # Use original bounding box if Detectron2 detection failed
+ if bbox_dets.sum() == 0:
+ bbox_dets = bboxes[max_dist_idx]
+
+ # Process segmentation in both temporal directions
+ masks_forward, sam_imgs_forward = self._run_sam_segmentation(
+ paths, bbox_dets, points, max_dist_idx, reverse=False
+ )
+ masks_reverse, sam_imgs_reverse = self._run_sam_segmentation(
+ paths, bbox_dets, points, max_dist_idx, reverse=True
+ )
+
+ # Combine bidirectional results
+ sam_imgs = self._combine_sam_images(imgs_rgb, sam_imgs_forward, sam_imgs_reverse)
+ masks = self._combine_masks(imgs_rgb, masks_forward, masks_reverse)
+
+ return {
+ f"{hand_side}_masks": masks,
+ f"{hand_side}_sam_imgs": sam_imgs
+ }
+
+ def _run_sam_segmentation(
+ self,
+ paths: Paths,
+ bbox_dets: np.ndarray,
+ points: np.ndarray,
+ max_dist_idx: int,
+ reverse: bool
+ ) -> Tuple[Dict[int, np.ndarray], Dict[int, np.ndarray]]:
+ """
+ Process video segmentation in either forward or reverse temporal direction.
+
+ Args:
+ paths: Paths object for file management
+ bbox_dets: Detectron2 bounding box for initialization
+ points: Hand keypoints for segmentation guidance
+ max_dist_idx: Index of highest-quality frame for initialization
+ reverse: Whether to process in reverse temporal order
+
+ Returns:
+ Tuple of (segmentation_masks, visualization_images)
+ """
+ return self.detector_sam.segment_video(
+ paths.original_images_folder,
+ bbox_dets,
+ points,
+ [max_dist_idx],
+ reverse=reverse
+ )
+
+ def get_detectron_bboxes(self, imgs_rgb: np.ndarray, bbox_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+ """
+ Generate enhanced bounding boxes using Detectron2 for improved segmentation.
+
+ Args:
+ imgs_rgb: Array of RGB frames with shape (N, H, W, 3)
+ bbox_data: Initial bounding box data from hand detection stage containing:
+ - left_bboxes: Left hand bounding boxes
+ - right_bboxes: Right hand bounding boxes
+ - left_hand_detected: Boolean flags for left hand detection
+ - right_hand_detected: Boolean flags for right hand detection
+ - left_bbox_min_dist_to_edge: Quality metrics for left hand
+ - right_bbox_min_dist_to_edge: Quality metrics for right hand
+
+ Returns:
+ Dictionary containing refined bounding boxes:
+ - left_det_bboxes: Enhanced left hand bounding boxes
+ - right_det_bboxes: Enhanced right hand bounding boxes
+
+ Raises:
+ ValueError: If input array is empty or has incorrect shape
+ """
+ self._validate_detectron_input(imgs_rgb)
+
+ # Extract detection data and initialize output arrays
+ detection_data = self._extract_detection_data(bbox_data)
+ left_det_bboxes, right_det_bboxes = self._initialize_bbox_arrays(imgs_rgb)
+
+ # Process only highest-quality frames for efficiency
+ idx_list = self._get_quality_frame_indices(bbox_data)
+
+ for idx in tqdm(idx_list, desc="Processing frames"):
+ try:
+ self._process_detectron_frame(
+ idx, imgs_rgb, detection_data, left_det_bboxes, right_det_bboxes
+ )
+ except Exception as e:
+ logging.error(f"Error processing frame {idx}: {str(e)}")
+
+ return {"left_det_bboxes": left_det_bboxes, "right_det_bboxes": right_det_bboxes}
+
+ def _validate_detectron_input(self, imgs_rgb: np.ndarray) -> None:
+ """
+ Validate input array for Detectron2 processing.
+
+ Args:
+ imgs_rgb: Array of RGB frames
+
+ Raises:
+ ValueError: If input array is empty or has incorrect shape
+ """
+ if len(imgs_rgb) == 0:
+ raise ValueError("Empty input array - no video frames provided")
+
+ if len(imgs_rgb.shape) != 4 or imgs_rgb.shape[-1] != 3:
+ raise ValueError(f"Expected input shape (N, H, W, 3), got {imgs_rgb.shape}. "
+ f"Input should be RGB video frames.")
+
+ def _extract_detection_data(self, bbox_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+ """
+ Extract detection data from bounding box data.
+
+ Args:
+ bbox_data: Bounding box detection data
+
+ Returns:
+ Dictionary containing extracted detection data
+ """
+ return {
+ "left_bboxes": bbox_data["left_bboxes"],
+ "right_bboxes": bbox_data["right_bboxes"],
+ "left_hand_detected": bbox_data["left_hand_detected"],
+ "right_hand_detected": bbox_data["right_hand_detected"]
+ }
+
+ def _initialize_bbox_arrays(self, imgs_rgb: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Initialize output bounding box arrays.
+
+ Args:
+ imgs_rgb: RGB video frames for shape reference
+
+ Returns:
+ Tuple of (left_det_bboxes, right_det_bboxes) initialized arrays
+ """
+ left_det_bboxes = np.zeros((len(imgs_rgb), 4))
+ right_det_bboxes = np.zeros((len(imgs_rgb), 4))
+ return left_det_bboxes, right_det_bboxes
+
+ def _get_quality_frame_indices(self, bbox_data: Dict[str, np.ndarray]) -> List[int]:
+ """
+ Get indices of highest-quality frames for processing.
+
+ Args:
+ bbox_data: Bounding box detection data
+
+ Returns:
+ List of frame indices to process
+ """
+ idx_left = np.argmax(bbox_data["left_bbox_min_dist_to_edge"])
+ idx_right = np.argmax(bbox_data["right_bbox_min_dist_to_edge"])
+ return [idx_left, idx_right]
+
+ def _process_detectron_frame(
+ self,
+ idx: int,
+ imgs_rgb: np.ndarray,
+ detection_data: Dict[str, np.ndarray],
+ left_det_bboxes: np.ndarray,
+ right_det_bboxes: np.ndarray
+ ) -> None:
+ """
+ Process a single frame with Detectron2 detection.
+
+ Args:
+ idx: Frame index to process
+ imgs_rgb: RGB video frames
+ detection_data: Extracted detection data
+ left_det_bboxes: Left hand bounding box output array
+ right_det_bboxes: Right hand bounding box output array
+ """
+ left_hand_detected = detection_data["left_hand_detected"]
+ right_hand_detected = detection_data["right_hand_detected"]
+
+ # Skip frames without any hand detections
+ if not left_hand_detected[idx] and not right_hand_detected[idx]:
+ left_det_bboxes[idx] = np.array([0, 0, 0, 0])
+ right_det_bboxes[idx] = np.array([0, 0, 0, 0])
+ return
+
+ # Apply Detectron2 detection
+ img = imgs_rgb[idx]
+ det_bboxes, det_scores = self.detectron_detector.get_bboxes(img, visualize=False)
+
+ if len(det_bboxes) == 0:
+ return
+
+ # Match left hand detection with Detectron2 results
+ if left_hand_detected[idx]:
+ self._match_hand_detection(
+ idx, "left", detection_data, det_bboxes, left_det_bboxes
+ )
+
+ # Match right hand detection with Detectron2 results
+ if right_hand_detected[idx]:
+ self._match_hand_detection(
+ idx, "right", detection_data, det_bboxes, right_det_bboxes
+ )
+
+ def _match_hand_detection(
+ self,
+ idx: int,
+ hand_side: str,
+ detection_data: Dict[str, np.ndarray],
+ det_bboxes: np.ndarray,
+ output_bboxes: np.ndarray
+ ) -> None:
+ """
+ Match hand detection with Detectron2 results using overlap scores.
+
+ Args:
+ idx: Frame index
+ hand_side: "left" or "right" hand
+ detection_data: Extracted detection data
+ det_bboxes: Detectron2 detection results
+ output_bboxes: Output bounding box array to update
+ """
+ bbox = detection_data[f"{hand_side}_bboxes"][idx]
+ overlap_scores = []
+
+ for det_bbox in det_bboxes:
+ overlap_score = get_overlap_score(bbox, det_bbox)
+ overlap_scores.append(overlap_score)
+
+ if np.max(overlap_scores) > DEFAULT_OVERLAP_THRESHOLD:
+ best_idx = np.argmax(overlap_scores)
+ output_bboxes[idx] = det_bboxes[best_idx].astype(np.int32)
+
+ @staticmethod
+ def _save_results(
+ paths: Paths,
+ masks: np.ndarray,
+ sam_imgs: np.ndarray,
+ fps: int = DEFAULT_FPS
+ ) -> None:
+ """
+ Save arm segmentation results to disk.
+
+ Args:
+ paths: Paths object containing output file locations
+ masks: Combined arm segmentation masks
+ sam_imgs: SAM visualization images
+ fps: Frames per second for output videos (default: 10)
+ """
+ ArmSegmentationProcessor._create_output_directory(paths)
+
+ try:
+ ArmSegmentationProcessor._save_mask_data(paths, masks)
+ ArmSegmentationProcessor._create_videos(paths, masks, sam_imgs, fps)
+ except Exception as e:
+ logging.error(f"Error saving results: {str(e)}")
+ raise
+
+ ArmSegmentationProcessor._cleanup_temp_files(paths)
+ ArmSegmentationProcessor._update_annotation_video(paths, masks, sam_imgs, fps)
+
+ @staticmethod
+ def _create_output_directory(paths: Paths) -> None:
+ """
+ Create output directory for segmentation results.
+
+ Args:
+ paths: Paths object containing output directory location
+ """
+ if not os.path.exists(paths.segmentation_processor):
+ os.makedirs(paths.segmentation_processor)
+
+ @staticmethod
+ def _save_mask_data(paths: Paths, masks: np.ndarray) -> None:
+ """
+ Save mask data to disk.
+
+ Args:
+ paths: Paths object containing output file locations
+ masks: Segmentation masks to save
+ """
+ np.save(paths.masks_arm, masks)
+
+ @staticmethod
+ def _create_videos(paths: Paths, masks: np.ndarray, sam_imgs: np.ndarray, fps: int) -> None:
+ """
+ Create visualization videos from masks and SAM images.
+
+ Args:
+ paths: Paths object containing output file locations
+ masks: Segmentation masks
+ sam_imgs: SAM visualization images
+ fps: Frames per second for output videos
+ """
+ for name, data in [
+ ("video_masks_arm", masks),
+ ("video_sam_arm", sam_imgs),
+ ]:
+ output_path = getattr(paths, name)
+ media.write_video(output_path, data, fps=fps, codec=DEFAULT_CODEC)
+
+ @staticmethod
+ def _cleanup_temp_files(paths: Paths) -> None:
+ """
+ Clean up temporary directories created during processing.
+
+ Args:
+ paths: Paths object containing temporary directory locations
+ """
+ if os.path.exists(paths.original_images_folder):
+ shutil.rmtree(paths.original_images_folder)
+ if os.path.exists(paths.original_images_folder_reverse):
+ shutil.rmtree(paths.original_images_folder_reverse)
+
+ @staticmethod
+ def _update_annotation_video(paths: Paths, masks: np.ndarray, sam_imgs: np.ndarray, fps: int) -> None:
+ """
+ Update existing annotation video with segmentation results.
+
+ Args:
+ paths: Paths object containing annotation video location
+ masks: Segmentation masks
+ sam_imgs: SAM visualization images
+ fps: Frames per second for output video
+ """
+ if os.path.exists(paths.video_annot):
+ annot_imgs = media.read_video(paths.video_annot)
+ for idx in range(len(annot_imgs)):
+ annot_img = annot_imgs[idx]
+ h = masks[idx].shape[0]
+ w = masks[idx].shape[1]
+ # Insert segmentation visualization in the top-right quadrant
+ annot_img[:h, w:, :] = sam_imgs[idx]
+ media.write_video(paths.video_annot, annot_imgs, fps=fps, codec=ANNOTATION_CODEC)
+
+
+
+class HandSegmentationProcessor(BaseSegmentationProcessor):
+ """
+ Processor for precise hand-only segmentation in video sequences.
+
+ Attributes:
+ Inherits detector_sam from BaseSegmentationProcessor
+ """
+ def __init__(self, args: argparse.Namespace) -> None:
+ """
+ Initialize the hand segmentation processor.
+
+ Args:
+ args: Command line arguments containing segmentation configuration
+ """
+ super().__init__(args)
+
+ def process_one_demo(self, data_sub_folder: str, hamer_data: Optional[Dict[str, HandSequence]] = None) -> None:
+ """
+ Process a single video demonstration to generate precise hand segmentation masks.
+
+ Args:
+ data_sub_folder: Path to the subfolder containing the demo data
+ hamer_data: Optional pre-loaded hand pose data for segmentation guidance
+
+ Raises:
+ FileNotFoundError: If required input files are not found
+ ValueError: If video frames or bounding boxes are invalid
+ """
+ save_folder = self.get_save_folder(data_sub_folder)
+
+ paths = self.get_paths(save_folder)
+ paths._setup_original_images()
+ paths._setup_original_images_reverse()
+
+ # Load and validate input data
+ imgs_rgb = self._load_video(paths.video_left)
+ bbox_data = self._load_bbox_data(paths.bbox_data)
+ if hamer_data is None:
+ hamer_data = self._load_hamer_data(paths)
+
+ # Process left and right hands separately for precise segmentation
+ left_data = self._process_hand_data(
+ imgs_rgb,
+ bbox_data["left_bboxes"],
+ bbox_data["left_bbox_min_dist_to_edge"],
+ bbox_data["left_hand_detected"],
+ hamer_data["left"],
+ paths,
+ "left"
+ )
+
+ right_data = self._process_hand_data(
+ imgs_rgb,
+ bbox_data["right_bboxes"],
+ bbox_data["right_bbox_min_dist_to_edge"],
+ bbox_data["right_hand_detected"],
+ hamer_data["right"],
+ paths,
+ "right"
+ )
+
+ # Convert to boolean masks
+ left_masks = left_data["left_masks"].astype(np.bool_)
+ left_sam_imgs = left_data["left_sam_imgs"]
+ right_masks = right_data["right_masks"].astype(np.bool_)
+ right_sam_imgs = right_data["right_sam_imgs"]
+
+ # Save results with separate left/right hand data
+ self._save_results(paths, left_masks, left_sam_imgs, right_masks, right_sam_imgs)
+
+
+ def _process_hand_data(
+ self,
+ imgs_rgb: np.ndarray,
+ bboxes: np.ndarray,
+ bbox_min_dist: np.ndarray,
+ hand_detected: np.ndarray,
+ hamer_data: HandSequence,
+ paths: Paths,
+ hand_side: str
+ ) -> Dict[str, np.ndarray]:
+ """
+ Process hand segmentation data for a single hand (left or right).
+
+ Args:
+ imgs_rgb: RGB video frames
+ bboxes: Hand bounding boxes from detection stage
+ bbox_min_dist: Minimum distances to image edges (quality metric)
+ hand_detected: Boolean flags indicating valid hand detections
+ hamer_data: Hand pose data for segmentation guidance
+ paths: Paths object for file management
+ hand_side: "left" or "right" specifying which hand to process
+
+ Returns:
+ Dictionary containing segmentation masks and visualization images
+ """
+ # Handle cases with no valid detections
+ if not hand_detected.any() or max(bbox_min_dist) == 0:
+ return {
+ f"{hand_side}_masks": np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1])),
+ f"{hand_side}_sam_imgs": np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1], 3))
+ }
+
+ # Extract hand pose keypoints for segmentation guidance
+ kpts_2d = hamer_data.kpts_2d
+
+ # Find the frame with highest quality (furthest from edges)
+ max_dist_idx = np.argmax(bbox_min_dist)
+ bbox = bboxes[max_dist_idx]
+ points = np.expand_dims(kpts_2d[max_dist_idx], axis=1)
+
+ # Process segmentation in both temporal directions
+ masks_forward, sam_imgs_forward = self._run_sam_segmentation(
+ paths, bbox, points, max_dist_idx, reverse=False, output_bboxes=bboxes
+ )
+ masks_reverse, sam_imgs_reverse = self._run_sam_segmentation(
+ paths, bbox, points, max_dist_idx, reverse=True, output_bboxes=bboxes
+ )
+
+ # Combine bidirectional results
+ sam_imgs = self._combine_sam_images(imgs_rgb, sam_imgs_forward, sam_imgs_reverse)
+ masks = self._combine_masks(imgs_rgb, masks_forward, masks_reverse)
+
+ return {
+ f"{hand_side}_masks": masks,
+ f"{hand_side}_sam_imgs": sam_imgs
+ }
+
+
+ def _run_sam_segmentation(
+ self,
+ paths: Paths,
+ bbox: np.ndarray,
+ points: np.ndarray,
+ max_dist_idx: int,
+ reverse: bool,
+ output_bboxes: np.ndarray
+ ) -> Tuple[Dict[int, np.ndarray], Dict[int, np.ndarray]]:
+ """
+ Process video segmentation in either forward or reverse temporal direction.
+
+ Args:
+ paths: Paths object for file management
+ bbox: Initial bounding box for segmentation
+ points: Hand keypoints for segmentation guidance
+ max_dist_idx: Index of highest-quality frame for initialization
+ reverse: Whether to process in reverse temporal order
+ output_bboxes: All bounding boxes for the sequence
+
+ Returns:
+ Tuple of (segmentation_masks, visualization_images)
+ """
+ return self.detector_sam.segment_video(
+ paths.original_images_folder,
+ bbox,
+ points,
+ [max_dist_idx],
+ reverse=reverse,
+ output_bboxes=output_bboxes
+ )
+
+ @staticmethod
+ def _save_results(
+ paths: Paths,
+ left_masks: np.ndarray,
+ left_sam_imgs: np.ndarray,
+ right_masks: np.ndarray,
+ right_sam_imgs: np.ndarray,
+ fps: int = DEFAULT_FPS
+ ) -> None:
+ """
+ Save hand segmentation results to disk.
+
+ Args:
+ paths: Paths object containing output file locations
+ left_masks: Left hand segmentation masks
+ left_sam_imgs: Left hand SAM visualization images
+ right_masks: Right hand segmentation masks
+ right_sam_imgs: Right hand SAM visualization images
+ fps: Frames per second for output videos (default: 10)
+ """
+ HandSegmentationProcessor._create_output_directory(paths)
+
+ try:
+ HandSegmentationProcessor._save_hand_mask_data(paths, left_masks, right_masks)
+ HandSegmentationProcessor._create_hand_videos(paths, left_masks, left_sam_imgs, right_masks, right_sam_imgs, fps)
+ except Exception as e:
+ logging.error(f"Error saving results: {str(e)}")
+ raise
+
+ HandSegmentationProcessor._cleanup_temp_files(paths)
+
+ @staticmethod
+ def _create_output_directory(paths: Paths) -> None:
+ """
+ Create output directory for segmentation results.
+
+ Args:
+ paths: Paths object containing output directory location
+ """
+ if not os.path.exists(paths.segmentation_processor):
+ os.makedirs(paths.segmentation_processor)
+
+ @staticmethod
+ def _save_hand_mask_data(paths: Paths, left_masks: np.ndarray, right_masks: np.ndarray) -> None:
+ """
+ Save hand mask data to disk.
+
+ Args:
+ paths: Paths object containing output file locations
+ left_masks: Left hand segmentation masks
+ right_masks: Right hand segmentation masks
+ """
+ np.save(paths.masks_hand_left, left_masks)
+ np.save(paths.masks_hand_right, right_masks)
+
+ @staticmethod
+ def _create_hand_videos(
+ paths: Paths,
+ left_masks: np.ndarray,
+ left_sam_imgs: np.ndarray,
+ right_masks: np.ndarray,
+ right_sam_imgs: np.ndarray,
+ fps: int
+ ) -> None:
+ """
+ Create visualization videos for hand segmentation.
+
+ Args:
+ paths: Paths object containing output file locations
+ left_masks: Left hand segmentation masks
+ left_sam_imgs: Left hand SAM visualization images
+ right_masks: Right hand segmentation masks
+ right_sam_imgs: Right hand SAM visualization images
+ fps: Frames per second for output videos
+ """
+ for name, data in [
+ ("video_masks_hand_left", left_masks),
+ ("video_masks_hand_right", right_masks),
+ ("video_sam_hand_left", left_sam_imgs),
+ ("video_sam_hand_right", right_sam_imgs),
+ ]:
+ output_path = getattr(paths, name)
+ media.write_video(output_path, data, fps=fps, codec=DEFAULT_CODEC)
+
+ @staticmethod
+ def _cleanup_temp_files(paths: Paths) -> None:
+ """
+ Clean up temporary directories created during processing.
+
+ Args:
+ paths: Paths object containing temporary directory locations
+ """
+ if os.path.exists(paths.original_images_folder):
+ shutil.rmtree(paths.original_images_folder)
+ if os.path.exists(paths.original_images_folder_reverse):
+ shutil.rmtree(paths.original_images_folder_reverse)
+
diff --git a/phantom/phantom/processors/smoothing_processor.py b/phantom/phantom/processors/smoothing_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ca2b9b325dc0cca10473bd515420b7ac3ac82c7
--- /dev/null
+++ b/phantom/phantom/processors/smoothing_processor.py
@@ -0,0 +1,303 @@
+"""
+Trajectory Smoothing Processor Module
+
+This module performs trajectory smoothing for end-effector positions, orientations, and gripper states
+extracted from human demonstrations.
+
+Processing Pipeline:
+1. Load processed action data from previous pipeline stages
+2. Apply Gaussian Process smoothing to 3D position trajectories
+3. Apply SLERP-based smoothing to rotation matrix trajectories
+4. Apply Gaussian Process smoothing to gripper distance trajectories
+5. Save smoothed trajectories for robot execution
+"""
+
+import os
+from typing import Optional
+import argparse
+import numpy as np
+import logging
+from sklearn.gaussian_process import GaussianProcessRegressor # type: ignore
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel # type: ignore
+from scipy.spatial.transform import Rotation, Slerp
+
+from phantom.processors.base_processor import BaseProcessor
+from phantom.processors.paths import Paths
+
+logger = logging.getLogger(__name__)
+
+def gaussian_kernel(size: int, sigma: float) -> np.ndarray:
+ """
+ Generate a centered Gaussian kernel for local smoothing operations.
+
+ Args:
+ size: Size of the kernel (should be odd for proper centering)
+ sigma: Standard deviation of the Gaussian distribution
+
+ Returns:
+ Normalized Gaussian kernel array
+
+ Raises:
+ ValueError: If size is not positive
+ """
+ if size <= 0:
+ raise ValueError("Kernel size must be positive")
+
+ x = np.arange(size) - size // 2
+ kernel = np.exp(-0.5 * (x / sigma) ** 2)
+ return kernel / kernel.sum()
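+
+# Illustrative usage sketch (not called by the pipeline): a 5-tap kernel with sigma=1.0
+# sums to 1.0 and peaks at its center index.
+#     weights = gaussian_kernel(size=5, sigma=1.0)
+#     # weights.shape == (5,), np.isclose(weights.sum(), 1.0), weights.argmax() == 2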
+
+class SmoothingProcessor(BaseProcessor):
+ """
+ This processor takes raw trajectory data extracted from human demonstrations
+ and applies smoothing techniques to create executable robot trajectories.
+
+ Attributes:
+ bimanual_setup (str): Configuration mode ("single_arm" or bimanual type)
+ target_hand (str): Target hand for single-arm processing ("left" or "right")
+ """
+ def __init__(self, args: argparse.Namespace) -> None:
+ """
+ Initialize the smoothing processor with configuration parameters.
+
+ Args:
+ args: Command line arguments containing smoothing configuration
+ including bimanual setup and target hand specification
+ """
+ super().__init__(args)
+
+ def process_one_demo(self, data_sub_folder: str) -> None:
+ """
+ Process and smooth trajectories for a single demonstration.
+
+ Args:
+ data_sub_folder: Path to demonstration data folder containing
+ processed action trajectories from previous stages
+ """
+ save_folder = self.get_save_folder(data_sub_folder)
+ paths = self.get_paths(save_folder)
+
+ # Handle single-arm processing mode
+ if self.bimanual_setup == "single_arm":
+ self._process_single_arm_demo(paths)
+ else:
+ self._process_bimanual_demo(paths)
+
+ def _process_single_arm_demo(self, paths: Paths) -> None:
+ """
+ Process single-arm demonstration data.
+
+ Args:
+ paths: Paths object containing file locations
+ """
+ # Load action data for target hand
+ actions_path = self._get_actions_path(paths)
+ actions = np.load(actions_path, allow_pickle=True)
+
+ # Apply smoothing to each trajectory component
+ smoothed_ee_pts = self.gaussian_process_smoothing(actions["ee_pts"])
+
+ # Apply rotation smoothing with configuration-specific parameters
+ if self.constrained_hand:
+ smoothed_ee_oris = self.gaussian_slerp_smoothing(
+ actions["ee_oris"], sigma=10.0, kernel_size=41
+ )
+ else:
+ smoothed_ee_oris = self.gaussian_slerp_smoothing(
+ actions["ee_oris"], sigma=10.0
+ )
+
+ smoothed_ee_widths = self.gaussian_process_smoothing(actions["ee_widths"])
+
+ # Save results based on target hand
+ if self.target_hand == "left":
+ self._save_results(paths, smoothed_ee_pts_left=smoothed_ee_pts,
+ smoothed_ee_oris_left=smoothed_ee_oris,
+ smoothed_ee_widths_left=smoothed_ee_widths)
+ else:
+ self._save_results(paths, smoothed_ee_pts_right=smoothed_ee_pts,
+ smoothed_ee_oris_right=smoothed_ee_oris,
+ smoothed_ee_widths_right=smoothed_ee_widths)
+
+ def _process_bimanual_demo(self, paths: Paths) -> None:
+ """
+ Process bimanual demonstration data.
+
+ Args:
+ paths: Paths object containing file locations
+ """
+ # Load data for both hands
+ actions_left_path = str(paths.actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ actions_right_path = str(paths.actions_right).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ actions_left = np.load(actions_left_path, allow_pickle=True)
+ actions_right = np.load(actions_right_path, allow_pickle=True)
+
+ # Apply position smoothing using Gaussian Process regression
+ smoothed_ee_pts_left = self.gaussian_process_smoothing(actions_left["ee_pts"])
+ smoothed_ee_pts_right = self.gaussian_process_smoothing(actions_right["ee_pts"])
+
+ # Apply rotation smoothing using SLERP with optimized parameters for bimanual coordination
+ smoothed_ee_oris_left = self.gaussian_slerp_smoothing(
+ actions_left["ee_oris"], sigma=10.0, kernel_size=21
+ )
+ smoothed_ee_oris_right = self.gaussian_slerp_smoothing(
+ actions_right["ee_oris"], sigma=10.0, kernel_size=21
+ )
+
+ # Apply gripper distance smoothing
+ smoothed_ee_widths_left = self.gaussian_process_smoothing(actions_left["ee_widths"])
+ smoothed_ee_widths_right = self.gaussian_process_smoothing(actions_right["ee_widths"])
+
+ # Save all smoothed trajectories
+ self._save_results(paths, smoothed_ee_pts_left, smoothed_ee_oris_left, smoothed_ee_widths_left,
+ smoothed_ee_pts_right, smoothed_ee_oris_right, smoothed_ee_widths_right)
+
+ def _get_actions_path(self, paths: Paths) -> str:
+ """
+ Get the appropriate actions file path based on target hand.
+
+ Args:
+ paths: Paths object containing file locations
+
+ Returns:
+ Path to the actions file for the target hand
+ """
+ if self.target_hand == "left":
+ base_path = str(paths.actions_left)
+ else:
+ base_path = str(paths.actions_right)
+ return base_path.split(".npz")[0] + f"_{self.bimanual_setup}.npz"
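+
+    # Naming convention sketch (hypothetical path): with target_hand == "right",
+    # bimanual_setup == "single_arm", and paths.actions_right == ".../some_name.npz",
+    # this resolves to ".../some_name_single_arm.npz".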
+
+ def _save_results(self, paths: Paths, smoothed_ee_pts_left: Optional[np.ndarray] = None,
+ smoothed_ee_oris_left: Optional[np.ndarray] = None,
+ smoothed_ee_widths_left: Optional[np.ndarray] = None,
+ smoothed_ee_pts_right: Optional[np.ndarray] = None,
+ smoothed_ee_oris_right: Optional[np.ndarray] = None,
+ smoothed_ee_widths_right: Optional[np.ndarray] = None) -> None:
+ """
+ Save smoothed trajectory results to disk.
+
+ Args:
+ paths: Paths object containing output file locations
+ smoothed_ee_pts_left: Smoothed left hand position trajectory
+ smoothed_ee_oris_left: Smoothed left hand orientation trajectory
+ smoothed_ee_widths_left: Smoothed left hand gripper trajectory
+ smoothed_ee_pts_right: Smoothed right hand position trajectory
+ smoothed_ee_oris_right: Smoothed right hand orientation trajectory
+ smoothed_ee_widths_right: Smoothed right hand gripper trajectory
+ """
+ # Create output directory
+ os.makedirs(paths.smoothing_processor, exist_ok=True)
+
+ # Save left hand trajectories if provided
+ if smoothed_ee_pts_left is not None:
+ smoothed_actions_left_path = str(paths.smoothed_actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ np.savez(smoothed_actions_left_path,
+ ee_pts=smoothed_ee_pts_left,
+ ee_oris=smoothed_ee_oris_left,
+ ee_widths=smoothed_ee_widths_left)
+
+ # Save right hand trajectories if provided
+ if smoothed_ee_pts_right is not None:
+ smoothed_actions_right_path = str(paths.smoothed_actions_right).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
+ np.savez(smoothed_actions_right_path,
+ ee_pts=smoothed_ee_pts_right,
+ ee_oris=smoothed_ee_oris_right,
+ ee_widths=smoothed_ee_widths_right)
+
+ @staticmethod
+ def gaussian_slerp_smoothing(rot_mats: np.ndarray, sigma: float = 2, kernel_size: int = 9) -> np.ndarray:
+ """
+ Apply Gaussian-weighted SLERP smoothing to rotation matrices.
+
+ Args:
+ rot_mats: Array of rotation matrices to smooth, shape (N, 3, 3)
+ sigma: Standard deviation for Gaussian kernel
+ kernel_size: Size of the smoothing kernel (should be odd)
+
+ Returns:
+ Array of smoothed rotation matrices, shape (N, 3, 3)
+
+ Raises:
+ ValueError: If kernel_size is not odd
+ """
+ if kernel_size % 2 != 1:
+ raise ValueError("Kernel size must be odd for proper centering")
+
+ half_k = kernel_size // 2
+ N = len(rot_mats)
+
+ # Step 1: Convert rotation matrices to quaternions for interpolation
+ quats = Rotation.from_matrix(rot_mats).as_quat()
+
+ # Step 2: Apply hemisphere correction to ensure quaternion continuity
+ quats_fixed = [quats[0]]
+ for i in range(1, N):
+ q = quats[i]
+ # Choose quaternion hemisphere that minimizes distance to previous quaternion
+ if np.dot(q, quats_fixed[-1]) < 0:
+ q = -q
+ quats_fixed.append(q)
+ quats_fixed = np.array(quats_fixed)
+
+ # Step 3: Prepare normalized Gaussian weights for local smoothing
+ weights = gaussian_kernel(kernel_size, sigma)
+
+ # Step 4: Apply weighted SLERP averaging for each time point
+ smoothed_rots = []
+ for i in range(N):
+ # Define local neighborhood around current time point
+ start = max(0, i - half_k)
+ end = min(N, i + half_k + 1)
+
+ # Extract local quaternions and corresponding weights
+ local_quats = quats_fixed[start:end]
+ local_weights = weights[half_k - (i - start): half_k + (end - i)]
+
+            # Normalize weights for the current neighborhood (use a copy so the shared
+            # kernel array is not modified in place)
+            local_weights = local_weights / local_weights.sum()
+
+ # Initialize weighted average with first quaternion
+ q_avg = local_quats[0]
+ r_avg = Rotation.from_quat(q_avg)
+
+ # Iteratively apply weighted SLERP interpolation
+ for j in range(1, len(local_quats)):
+ r_next = Rotation.from_quat(local_quats[j])
+ # Use SLERP with weight proportional to current quaternion's contribution
+ r_avg = Slerp([0, 1], Rotation.concatenate([r_avg, r_next]))([local_weights[j] / (local_weights[:j+1].sum())])[0]
+
+ smoothed_rots.append(r_avg.as_matrix())
+
+ return np.stack(smoothed_rots)
+
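+    # Illustrative sketch (hypothetical data): smoothing five rotations about the z-axis.
+    #     rots = Rotation.from_euler("z", [0, 5, 40, 15, 20], degrees=True).as_matrix()
+    #     smoothed = SmoothingProcessor.gaussian_slerp_smoothing(rots, sigma=1.0, kernel_size=3)
+    #     # smoothed.shape == (5, 3, 3); the 40-degree outlier is pulled toward its neighbours.
+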
+ @staticmethod
+ def gaussian_process_smoothing(pts: np.ndarray) -> np.ndarray:
+ """
+ Apply Gaussian process smoothing to trajectory points.
+
+ Args:
+ pts: Trajectory points to smooth, shape (N,) for 1D or (N, D) for multi-dimensional
+
+ Returns:
+ Smoothed trajectory points with same shape as input
+
+ Raises:
+ ValueError: If pts is empty
+ """
+ if len(pts) == 0:
+ raise ValueError("Cannot smooth empty trajectory")
+
+ # Create time indices as features for GP regression
+ time = np.arange(len(pts))[:, None] # Time as a single feature
+
+ # Configure GP kernel: RBF for smoothness + White noise for robustness
+ kernel = RBF(length_scale=1) + WhiteKernel(noise_level=1)
+ gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
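+        # Note: length_scale=1 and noise_level=1 are only initial values; by default,
+        # GaussianProcessRegressor re-optimizes these kernel hyperparameters during fit().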
+
+ # Handle 1D trajectory case
+ if pts.ndim == 1:
+ return gpr.fit(time, pts).predict(time)
+
+ # Handle multi-dimensional trajectory case by processing each dimension independently
+ return np.column_stack([gpr.fit(time, pts[:, i]).predict(time) for i in range(pts.shape[1])])
\ No newline at end of file
diff --git a/phantom/phantom/twin_bimanual_robot.py b/phantom/phantom/twin_bimanual_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..48dae775515e4b785a5b0a97c496a4c2373db6e7
--- /dev/null
+++ b/phantom/phantom/twin_bimanual_robot.py
@@ -0,0 +1,597 @@
+"""
+Virtual twin bimanual robot implementation for MuJoCo simulation.
+
+This module provides a TwinBimanualRobot class that creates a virtual representation
+of a bimanual (two-arm) robot system in MuJoCo using the robosuite framework.
+The twin robot can be controlled via end-effector poses or joint positions and
+provides observation data including RGB images, depth maps, and robot masks.
+"""
+
+from collections import deque
+import cv2
+import numpy as np
+from scipy.spatial.transform import Rotation
+from dataclasses import dataclass
+from typing import Tuple, Union, Any
+
+from robosuite.controllers import load_controller_config # type: ignore
+from robosuite.utils.camera_utils import get_real_depth_map # type: ignore
+from robomimic.envs.env_robosuite import EnvRobosuite # type: ignore
+import robomimic.utils.obs_utils as ObsUtils # type: ignore
+
+
+@dataclass
+class MujocoCameraParams:
+ """
+ Camera parameters for MuJoCo simulation.
+
+ Attributes:
+ name: Camera name identifier
+ pos: 3D position of camera in world coordinates
+ ori_wxyz: Camera orientation as quaternion (w, x, y, z)
+ fov: Field of view in degrees
+ resolution: Image resolution as (width, height)
+ sensorsize: Physical sensor size in mm
+ principalpixel: Principal point coordinates in pixels
+ focalpixel: Focal length in pixels
+ """
+ name: str
+ pos: np.ndarray
+ ori_wxyz: np.ndarray
+ fov: float
+ resolution: Tuple[int, int]
+ sensorsize: np.ndarray
+ principalpixel: np.ndarray
+ focalpixel: np.ndarray
+
+# Color constants for visualization (RGBA format)
+THUMB_COLOR = [0, 1, 0, 1] # Green for thumb
+INDEX_COLOR = [1, 0, 0, 1] # Red for index finger
+HAND_EE_COLOR = [0, 0, 1, 1] # Blue for hand end-effector
+
+# Transformation matrix for Epic Kitchen setup - converts from base frame to robot frame
+BASE_T_1 = np.array([[0.0, -1.0, 0.0, 0.0],
+ [ 0.5, 0.0, 0.866, 0.2],
+ [-0.866, 0.0, 0.5, 1.50],
+ [ 0.0, 0.0, 0.0, 1.0]])
+
+def convert_real_camera_ori_to_mujoco(camera_ori_matrix: np.ndarray) -> np.ndarray:
+ """
+ Convert camera orientation from real world to MuJoCo XML format.
+
+ MuJoCo uses a different coordinate system convention, so we need to
+ flip the Y and Z axes of the rotation matrix before converting to quaternion.
+
+ Args:
+ camera_ori_matrix: 3x3 rotation matrix in real-world coordinates
+
+ Returns:
+ Camera orientation as quaternion in MuJoCo format (w, x, y, z)
+ """
+    camera_ori_matrix = camera_ori_matrix.copy()  # avoid mutating the caller's matrix
+    camera_ori_matrix[:, [1, 2]] = -camera_ori_matrix[:, [1, 2]]
+ r = Rotation.from_matrix(camera_ori_matrix)
+ camera_ori_wxyz = r.as_quat(scalar_first=True)
+ return camera_ori_wxyz
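+
+# Illustrative sketch: an identity real-world orientation maps to a 180-degree rotation
+# about the camera x-axis in MuJoCo convention (quaternion with zero scalar part).
+#     convert_real_camera_ori_to_mujoco(np.eye(3))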
+
+class TwinBimanualRobot:
+ """
+ Virtual twin of a bimanual robot system in MuJoCo simulation.
+
+ This class creates a simulated bimanual robot that can be controlled via
+ end-effector poses or joint positions. It provides functionality for:
+ - Robot pose control (OSC or joint-level)
+ - Camera observation collection (RGB, depth, segmentation)
+ - Robot and gripper mask generation
+ - Observation history management
+ """
+
+ def __init__(self, robot_name: str, gripper_name: str, bimanual_setup: str,
+ camera_params: MujocoCameraParams, camera_height: int, camera_width: int,
+ render: bool, n_steps_short: int, n_steps_long: int, square: bool = False,
+ debug_cameras: list[str] = [], epic: bool = False, joint_controller: bool = False):
+ """
+ Initialize the bimanual robot twin.
+
+ Args:
+ robot_name: Type of robot (e.g., "Kinova3")
+ gripper_name: Type of gripper (e.g., "Robotiq85")
+ bimanual_setup: Configuration for bimanual setup
+ camera_params: Camera configuration parameters
+ camera_height: Height of camera images in pixels
+ camera_width: Width of camera images in pixels
+ render: Whether to render the simulation visually
+ n_steps_short: Number of simulation steps for quick movements
+ n_steps_long: Number of simulation steps for initial/slow movements
+ square: Whether to crop images to square aspect ratio
+ debug_cameras: Additional camera names for debugging views
+ epic: Whether to use Epic Kitchen coordinate system
+ joint_controller: Whether to use joint-level control instead of OSC
+ """
+ # Store configuration parameters
+ self.robot_name = robot_name
+ self.gripper_name = gripper_name
+ self.bimanual_setup = bimanual_setup
+ self.camera_params = camera_params
+ self.render = render
+ self.n_steps_long = n_steps_long
+        self.n_steps_short = n_steps_short
+ self.num_frames = 2 # Number of frames to keep in observation history
+ self.camera_height = camera_height
+ self.camera_width = camera_width
+ self.camera_name = "zed" # Main camera name
+ self.square = square
+ self.debug_cameras = list(debug_cameras) if debug_cameras else []
+ self.epic = epic # Epic Kitchen mode flag
+ self.joint_controller = joint_controller # Control mode flag
+
+ # Configure observation specifications for robomimic
+ obs_spec = dict(
+ obs=dict(
+ low_dim=["robot0_eef_pos"], # End-effector position observations
+ rgb=[f"{self.camera_params.name}_image"] + [f"{cam}_image" for cam in self.debug_cameras],
+ ),
+ )
+ ObsUtils.initialize_obs_utils_with_obs_specs(
+ obs_modality_specs=obs_spec)
+
+ # Configure robosuite environment options
+ options: dict[str, Union[str, list[str], dict[str, Any], bool, int, np.ndarray]] = {}
+ options["env_name"] = "PhantomBimanual"
+ options["bimanual_setup"] = bimanual_setup
+ options["robots"] = [self.robot_name, self.robot_name] # Two identical robots
+ if self.robot_name == "Kinova3":
+ options["gripper_types"] = [f"{self.gripper_name}GripperRealKinova", f"{self.gripper_name}GripperRealKinova"]
+ else:
+ options["gripper_types"] = [f"{self.gripper_name}Gripper", f"{self.gripper_name}Gripper"]
+
+ # Configure controller (OSC pose control by default)
+ controller_config = load_controller_config(default_controller="OSC_POSE")
+ controller_config["control_delta"] = False # Use absolute positioning
+ controller_config["uncouple_pos_ori"] = False # Couple position and orientation
+ options["controller_configs"] = controller_config
+
+ # Override with joint controller if specified
+ if self.joint_controller:
+ controller_config = load_controller_config(default_controller="JOINT_POSITION")
+ controller_config["input_type"] = "absolute"
+ controller_config["input_max"] = 10
+ controller_config["input_min"] = -10
+ controller_config["output_max"] = 10
+ controller_config["output_min"] = -10
+ controller_config["kd"] = 200 # Derivative gain
+ controller_config["kv"] = 200 # Velocity gain
+ controller_config["kp"] = 1000 # Proportional gain
+ controller_config["kp_limits"] = [0, 1000] # Proportional gain limits
+ options["controller_configs"] = controller_config
+
+ # Camera and observation settings
+ options["camera_heights"] = self.camera_height
+ options["camera_widths"] = self.camera_width
+ options["camera_segmentations"] = "instance" # Instance segmentation masks
+ options["direct_gripper_control"] = True
+ options["use_depth_obs"] = True
+
+ # Apply Epic Kitchen coordinate transformation if enabled
+ if self.epic:
+ self.base_T_1 = BASE_T_1
+ # Transform camera position and orientation to Epic Kitchen frame
+ self.camera_params.pos = self.base_T_1[:3, :3] @ self.camera_params.pos + self.base_T_1[:3, 3]
+ camera_ori_matrix = self.base_T_1[:3, :3] @ Rotation.from_quat(self.camera_params.ori_wxyz, scalar_first=True).as_matrix()
+ self.camera_params.ori_wxyz = Rotation.from_matrix(camera_ori_matrix).as_quat(scalar_first=True)
+
+ # Set camera parameters
+ options["camera_pos"] = self.camera_params.pos
+ options["camera_quat_wxyz"] = self.camera_params.ori_wxyz
+ options["camera_sensorsize"] = self.camera_params.sensorsize
+ options["camera_principalpixel"] = self.camera_params.principalpixel
+ options["camera_focalpixel"] = self.camera_params.focalpixel
+
+ # Create the robosuite environment
+ self.env = EnvRobosuite(
+ **options,
+ render=render,
+ render_offscreen=True, # Enable offscreen rendering for image capture
+ use_image_obs=True,
+ camera_names=[self.camera_params.name] + self.debug_cameras,
+ control_freq=20, # 20 Hz control frequency
+ )
+
+ # Initialize environment and compute robot base position
+ self.reset()
+ self.robot_base_pos = np.array([0, 0, self.env.env.robot_base_height+self.env.env.robot_base_offset])
+
+
+ def reset(self):
+ """Reset environment and clear observation history."""
+ self.env.reset()
+ self.obs_history = deque()
+
+ def close(self):
+ """Close the simulation environment."""
+ self.env.env.close()
+
+ def get_action_from_ee_pose(self, ee_pos: np.ndarray, ee_quat_xyzw: np.ndarray, gripper_action: float,
+ use_base_offset: bool = False) -> np.ndarray:
+ """
+ Convert end-effector pose to robot action vector.
+
+ This method transforms the desired end-effector position and orientation
+ into the action format expected by the robot controller.
+
+ Args:
+ ee_pos: End-effector position as 3D array
+ ee_quat_xyzw: End-effector orientation as quaternion (x, y, z, w)
+ gripper_action: Gripper action value
+ use_base_offset: Whether to add robot base offset to position
+
+ Returns:
+ Action vector [position(3), rotation(3), gripper(1)]
+ """
+ # Handle batch inputs by taking the last element
+ if ee_pos.ndim > 1:
+ ee_pos = ee_pos[-1]
+ ee_quat_xyzw = ee_quat_xyzw[-1]
+
+ # Add base offset if requested and not in Epic mode
+ if use_base_offset and not self.epic:
+ ee_pos = ee_pos + self.robot_base_pos
+
+ # Apply coordinate transformations based on mode
+ if self.epic:
+ # Transform position and orientation to Epic Kitchen coordinate frame
+ ee_pos = self.base_T_1[:3, 3] + self.base_T_1[:3, :3] @ ee_pos
+ axis_angle = Rotation.from_matrix(self.base_T_1[:3, :3] @ Rotation.from_quat(ee_quat_xyzw).as_matrix()).as_rotvec()
+        else:
+ # Apply 135-degree Z rotation for standard setup
+ rot = Rotation.from_quat(ee_quat_xyzw)
+ rot_135deg = Rotation.from_euler('z', 135, degrees=True)
+ new_rot = rot * rot_135deg
+ axis_angle = new_rot.as_rotvec()
+
+ # Combine into action vector
+ action = np.concatenate([ee_pos, axis_angle, [gripper_action]])
+
+ return action
+
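+    # Illustrative sketch (hypothetical instance `twin`, standard non-Epic mode): the returned
+    # action is a 7-vector [x, y, z, rx, ry, rz, gripper].
+    #     twin.get_action_from_ee_pose(np.array([0.4, 0.0, 0.2]), np.array([0, 0, 0, 1]), 255.0)
+    #     # -> position, then a 135-degree z rotation as an axis-angle vector, then 255.0
+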
+ def _get_initial_obs_history(self, state: dict) -> deque:
+ """
+ Initialize observation history by repeating the first observation.
+
+ This creates a history buffer filled with the initial robot state,
+ which is useful for algorithms that require temporal context.
+
+ Args:
+ state: Initial robot state dictionary
+
+ Returns:
+ Deque containing repeated initial observations
+ """
+ obs_history = deque(
+ [self.move_to_target_state(state, init=True)],
+ maxlen=self.num_frames,
+ )
+ # Fill remaining slots with copies of the initial observation
+ for _ in range(self.num_frames-1):
+ obs_history.append(self.move_to_target_state(state))
+ return obs_history
+
+ def get_obs_history(self, state: dict) -> list:
+ """
+ Get observation history with specified length.
+
+ Maintains a rolling buffer of recent observations for temporal context.
+
+ Args:
+ state: Current robot state dictionary
+
+ Returns:
+ List of recent observations (length = self.num_frames)
+ """
+ if len(self.obs_history) == 0:
+ # Initialize history if empty
+ self.obs_history = self._get_initial_obs_history(state)
+ else:
+ # Add new observation to history
+ self.obs_history.append(self.move_to_target_state(state))
+ return list(self.obs_history)
+
+    def move_to_target_state(self, state: dict, init: bool = False) -> dict:
+ """
+ Move robot to target state and collect observation data.
+
+ This is the main method for controlling the robot and collecting observations.
+ It handles both pose and joint control modes, and collects RGB, depth,
+ and segmentation data along with tracking errors.
+
+ Args:
+ state: Target state containing positions, orientations, and gripper states
+ init: Whether this is an initialization step (uses longer movement time)
+
+ Returns:
+ Dictionary containing observation data:
+ - robot_mask: Binary mask showing robot pixels
+ - gripper_mask: Binary mask showing gripper pixels
+ - rgb_img: RGB camera image
+ - depth_img: Depth camera image
+ - robot_pos: Robot end-effector position
+ - left_pos_err: Left arm position tracking error
+ - right_pos_err: Right arm position tracking error
+ - {cam}_img: Additional camera images if debug_cameras specified
+ """
+ # Convert gripper positions to actions based on controller type
+ if not self.joint_controller:
+ # Use pose controller with gripper position mapping
+ gripper_action_0 = self._convert_handgripper_pos_to_action(state["gripper_pos"][0])
+ gripper_action_1 = self._convert_handgripper_pos_to_action(state["gripper_pos"][1])
+ gripper_action = [gripper_action_0, gripper_action_1]
+ else:
+ # Use joint controller with direct gripper control
+ gripper_action = [state["gripper_pos"][0]*255, state["gripper_pos"][1]*255]
+
+ # Choose movement duration based on whether this is initialization
+ n_steps = self.n_steps_long if init else self.n_steps_short
+
+ # Execute movement based on controller type
+ if not self.joint_controller:
+ # Move using pose control
+ obs = self.move_to_pose(state["pos"], state["ori_xyzw"], gripper_action, n_steps)
+ else:
+ # Move using joint control
+ obs = self.move_to_pose(state["pos"], state["ori_xyzw"], gripper_action, n_steps, state["q0"], state["q1"])
+
+ # Extract observation data from simulation
+ robot_mask = np.squeeze(self.get_robot_mask(obs))
+ gripper_mask = np.squeeze(self.get_gripper_mask(obs))
+ rgb_img = self.get_image(obs)
+ depth_img = self.get_depth_image(obs)
+ robot_pos = obs["robot0_eef_pos"] - self.robot_base_pos
+
+ # Calculate end-effector tracking errors for both arms
+ if not self.epic:
+ # Standard coordinate frame
+ right_pos_error = np.linalg.norm(obs['robot0_eef_pos']-self.robot_base_pos - state["pos"][0])
+ left_pos_error = np.linalg.norm(obs['robot1_eef_pos']-self.robot_base_pos - state["pos"][1])
+ else:
+ # Epic Kitchen coordinate frame
+ right_pos_error = np.linalg.norm(obs['robot0_eef_pos']-self.base_T_1[:3, 3] - self.base_T_1[:3, :3] @ state["pos"][0])
+ left_pos_error = np.linalg.norm(obs['robot1_eef_pos']-self.base_T_1[:3, 3] - self.base_T_1[:3, :3] @ state["pos"][1])
+
+ # Compile output dictionary
+ output = {
+ "robot_mask": robot_mask,
+ "gripper_mask": gripper_mask,
+ "rgb_img": rgb_img,
+ "depth_img": depth_img,
+ "robot_pos": robot_pos,
+ "left_pos_err": left_pos_error,
+ "right_pos_err": right_pos_error,
+ }
+
+ # Add debug camera images if specified
+ for cam in self.debug_cameras:
+ cam_img = self.get_camera_image(obs, cam)
+ output[f"{cam}_img"] = cam_img
+
+ return output
+
+ def _convert_handgripper_pos_to_action(self, gripper_pos: float) -> np.ndarray:
+ """
+ Convert hand gripper position to robot gripper action.
+
+ Maps from physical gripper opening distance to robot action values.
+ Different gripper types may have different mappings.
+
+ Args:
+ gripper_pos: Gripper opening distance in meters
+
+ Returns:
+ Robot gripper action value (0-255 for Robotiq85)
+
+ Raises:
+ ValueError: If gripper type is not supported
+ """
+ if self.gripper_name == "Robotiq85":
+ # Robotiq85 gripper specifications
+ min_gripper_pos, max_gripper_pos = 0.0, 0.085 # 0 to 8.5cm opening
+ gripper_pos = np.clip(gripper_pos, min_gripper_pos, max_gripper_pos)
+ open_gripper_action, closed_gripper_action = 0, 255 # 0=open, 255=closed
+ # Linear interpolation between open and closed states
+ return np.interp(gripper_pos, [min_gripper_pos, max_gripper_pos], [closed_gripper_action, open_gripper_action])
+ else:
+ raise ValueError(f"Gripper name {self.gripper_name} not supported")
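+
+    # Illustrative mapping for Robotiq85 (hypothetical inputs): a 0.0 m opening maps to
+    # action 255 (closed), 0.085 m to 0 (open), and 0.0425 m to 127.5 (half open).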
+
+ def move_to_pose(self, ee_pos: dict, ee_ori: dict, gripper_action: dict, n_steps: int, q0=None, q1=None) -> dict:
+ """
+ Execute robot movement to target pose.
+
+ Sends action commands to the simulation for the specified number of steps.
+ Handles both pose control (OSC) and joint control modes.
+
+ Args:
+ ee_pos: End-effector positions for both arms {0: pos0, 1: pos1}
+ ee_ori: End-effector orientations for both arms {0: ori0, 1: ori1}
+ gripper_action: Gripper actions for both arms {0: grip0, 1: grip1}
+ n_steps: Number of simulation steps to execute
+ q0: Joint positions for arm 0 (only for joint controller)
+ q1: Joint positions for arm 1 (only for joint controller)
+
+ Returns:
+ Final observation dictionary from simulation
+ """
+ if not self.joint_controller:
+ # Pose control mode: convert poses to actions
+ action_0 = self.get_action_from_ee_pose(ee_pos[0], ee_ori[0], gripper_action[0], use_base_offset=True)
+ action_1 = self.get_action_from_ee_pose(ee_pos[1], ee_ori[1], gripper_action[1], use_base_offset=True)
+ action = np.concatenate([action_0, action_1])
+ else:
+            # Joint control mode: convert joint angles from degrees to radians,
+            # wrapping angles >= 180 degrees into the [-pi, pi) range
+            def _to_wrapped_radians(q_deg):
+                return [np.deg2rad(a - 360.0) if a >= 180 else np.deg2rad(a) for a in q_deg]
+
+            q0_new = _to_wrapped_radians(q0)
+            q1_new = _to_wrapped_radians(q1)
+
+            # Combine joint positions and gripper actions
+            action = np.concatenate([q0_new, np.array(gripper_action[0]).reshape(1,),
+                                     q1_new, np.array(gripper_action[1]).reshape(1,)])
+
+ # Execute action for specified number of steps
+ for _ in range(n_steps):
+ obs, _, _, _ = self.env.step(action)
+ if self.render:
+ self.env.render()
+ return obs
+
+ def get_proprioception(self, obs: dict) -> np.ndarray:
+ """
+ Get proprioceptive information (robot's internal state).
+
+ Args:
+ obs: Observation dictionary from simulation
+
+ Returns:
+ End-effector position of first robot
+ """
+ pos = obs["robot0_eef_pos"]
+ return pos
+
+ def get_image(self, obs: dict) -> np.ndarray:
+ """
+ Extract RGB image from observation.
+
+ Handles image format conversion and optional square cropping.
+
+ Args:
+ obs: Observation dictionary containing image data
+
+ Returns:
+ RGB image as numpy array (H, W, 3)
+ """
+ img = obs[f"{self.camera_name}_image"]
+ img = img.transpose(1, 2, 0) # Convert from CHW to HWC format
+ height = img.shape[0]
+ width = img.shape[1]
+
+ # Crop to square if requested
+ if self.square:
+ n_remove = int((width - height)/2)
+ img = img[:,n_remove:-n_remove,:]
+ return img
+
+ def get_camera_image(self, obs: dict, camera_name: str) -> np.ndarray:
+ """
+ Extract RGB image from specific camera.
+
+ Args:
+ obs: Observation dictionary containing image data
+ camera_name: Name of the camera to extract image from
+
+ Returns:
+ RGB image as numpy array (H, W, 3)
+ """
+ img = obs[f"{camera_name}_image"]
+ img = img.transpose(1, 2, 0) # Convert from CHW to HWC format
+ height = img.shape[0]
+ width = img.shape[1]
+
+ # Crop to square if requested
+ if self.square:
+ n_remove = int((width - height)/2)
+ img = img[:,n_remove:-n_remove,:]
+ return img
+
+ def get_seg_image(self, obs: dict) -> np.ndarray:
+ """
+ Extract instance segmentation image.
+
+ Args:
+ obs: Observation dictionary containing segmentation data
+
+ Returns:
+ Segmentation image as uint8 array where each pixel value
+ represents a different object instance ID
+ """
+ img = obs[f"{self.camera_name}_segmentation_instance"]
+ height = img.shape[0]
+ width = img.shape[1]
+
+ # Crop to square if requested
+ if self.square:
+ n_remove = int((width - height)/2)
+ img = img[:,n_remove:-n_remove,:]
+ img = img.astype(np.uint8)
+ return img
+
+ def get_depth_image(self, obs: dict) -> np.ndarray:
+ """
+ Extract and process depth image.
+
+ Converts raw depth buffer to real-world depth values using
+ robosuite's depth processing utilities.
+
+ Args:
+ obs: Observation dictionary containing depth data
+
+ Returns:
+ Depth image as numpy array where values represent
+ distance in meters
+ """
+ img = obs[f"{self.camera_name}_depth"]
+ img = get_real_depth_map(sim=self.env.env.sim, depth_map=img)
+ height = img.shape[0]
+ width = img.shape[1]
+
+ # Crop to square if requested
+ if self.square:
+ n_remove = int((width - height)/2)
+ img = img[:,n_remove:-n_remove,:]
+ return img
+
+ def get_robot_mask(self, obs: dict) -> np.ndarray:
+ """
+ Generate binary mask for robot pixels.
+
+ Uses instance segmentation to identify which pixels belong to
+ the robot arms (instance IDs 1 and 4).
+
+ Args:
+ obs: Observation dictionary containing segmentation data
+
+ Returns:
+ Binary mask where 1 indicates robot pixels, 0 otherwise
+ """
+ seg_img = self.get_seg_image(obs)
+ mask = np.zeros_like(seg_img)
+ mask[seg_img == 1] = 1 # First robot arm
+ mask[seg_img == 4] = 1 # Second robot arm
+ return mask
+
+ def get_gripper_mask(self, obs: dict) -> np.ndarray:
+ """
+ Generate binary mask for gripper pixels.
+
+ Uses instance segmentation to identify which pixels belong to
+ the robot grippers (instance IDs 3 and 6).
+
+ Args:
+ obs: Observation dictionary containing segmentation data
+
+ Returns:
+ Binary mask where 1 indicates gripper pixels, 0 otherwise
+ """
+ seg_img = self.get_seg_image(obs)
+ mask = np.zeros_like(seg_img)
+ mask[seg_img == 3] = 1 # First gripper
+ mask[seg_img == 6] = 1 # Second gripper
+ return mask
diff --git a/phantom/phantom/twin_robot.py b/phantom/phantom/twin_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..006539ace12d02a24354c21a73edcbfe447f39bf
--- /dev/null
+++ b/phantom/phantom/twin_robot.py
@@ -0,0 +1,490 @@
+"""
+Virtual twin single-arm robot implementation for MuJoCo simulation.
+
+This module provides a TwinRobot class that creates a virtual representation
+of a single-arm robot system in MuJoCo using the robosuite framework.
+The twin robot can be controlled via end-effector poses and provides
+observation data including RGB images, depth maps, and robot masks.
+"""
+
+from collections import deque
+import cv2
+import numpy as np
+from scipy.spatial.transform import Rotation
+from dataclasses import dataclass
+from typing import Tuple, Union, Any
+
+from robosuite.controllers import load_controller_config # type: ignore
+from robosuite.utils.camera_utils import get_real_depth_map # type: ignore
+from robomimic.envs.env_robosuite import EnvRobosuite # type: ignore
+import robomimic.utils.obs_utils as ObsUtils # type: ignore
+
+
+@dataclass
+class MujocoCameraParams:
+ """
+ Camera parameters for MuJoCo simulation.
+
+ Attributes:
+ name: Camera name identifier
+ pos: 3D position of camera in world coordinates
+ ori_wxyz: Camera orientation as quaternion (w, x, y, z)
+ fov: Field of view in degrees
+ resolution: Image resolution as (width, height)
+ sensorsize: Physical sensor size in mm
+ principalpixel: Principal point coordinates in pixels
+ focalpixel: Focal length in pixels
+ """
+ name: str
+ pos: np.ndarray
+ ori_wxyz: np.ndarray
+ fov: float
+ resolution: Tuple[int, int]
+ sensorsize: np.ndarray
+ principalpixel: np.ndarray
+ focalpixel: np.ndarray
+
+# Color constants for visualization (RGBA format)
+THUMB_COLOR = [0, 1, 0, 1] # Green for thumb
+INDEX_COLOR = [1, 0, 0, 1] # Red for index finger
+HAND_EE_COLOR = [0, 0, 1, 1] # Blue for hand end-effector
+
+def convert_real_camera_ori_to_mujoco(camera_ori_matrix: np.ndarray) -> np.ndarray:
+ """
+ Convert camera orientation from real world to MuJoCo XML format.
+
+ MuJoCo uses a different coordinate system convention, so we need to
+ flip the Y and Z axes of the rotation matrix before converting to quaternion.
+
+ Args:
+ camera_ori_matrix: 3x3 rotation matrix in real-world coordinates
+
+ Returns:
+ Camera orientation as quaternion in MuJoCo format (w, x, y, z)
+ """
+    camera_ori_matrix = camera_ori_matrix.copy()  # avoid mutating the caller's matrix
+    camera_ori_matrix[:, [1, 2]] = -camera_ori_matrix[:, [1, 2]]
+ r = Rotation.from_matrix(camera_ori_matrix)
+ camera_ori_wxyz = r.as_quat(scalar_first=True)
+ return camera_ori_wxyz
+
+
+class TwinRobot:
+ """
+ Virtual twin of a single-arm robot system in MuJoCo simulation.
+
+ This class creates a simulated single-arm robot that can be controlled via
+ end-effector poses. It provides functionality for:
+ - Robot pose control using OSC (Operational Space Control)
+ - Camera observation collection (RGB, depth, segmentation)
+ - Robot and gripper mask generation
+ - Observation history management
+ """
+
+ # Robot configuration constants
+ DEFAULT_ROBOT_BASE_POS = np.array([-0.56, 0, 0.912])
+
+ def __init__(self, robot_name: str, gripper_name: str, camera_params: MujocoCameraParams, camera_height: int, camera_width: int,
+ render: bool, n_steps_short: int, n_steps_long: int, debug_cameras: list[str] = [],
+ square: bool = False):
+ """
+ Initialize the single-arm robot twin.
+
+ Args:
+ robot_name: Type of robot (e.g., "Kinova3")
+ gripper_name: Type of gripper (e.g., "Robotiq85")
+ camera_params: Camera configuration parameters
+ camera_height: Height of camera images in pixels
+ camera_width: Width of camera images in pixels
+ render: Whether to render the simulation visually
+ n_steps_short: Number of simulation steps for quick movements
+ n_steps_long: Number of simulation steps for initial/slow movements
+ debug_cameras: Additional camera names for debugging views
+ square: Whether to crop images to square aspect ratio
+ """
+ # Store configuration parameters
+ self.robot_name = robot_name
+ self.gripper_name = gripper_name
+ self.camera_params = camera_params
+ self.render = render
+ self.n_steps_long = n_steps_long
+        self.n_steps_short = n_steps_short
+ self.num_frames = 2 # Number of frames to keep in observation history
+ self.camera_height = camera_height
+ self.camera_width = camera_width
+ self.camera_name = "frontview" # Main camera name for single-arm setup
+ self.square = square
+ self.debug_cameras = list(debug_cameras) if debug_cameras else []
+
+ # Configure observation specifications for robomimic
+ obs_spec = dict(
+ obs=dict(
+ low_dim=["robot0_eef_pos"], # End-effector position observations
+ rgb=[f"{self.camera_params.name}_image"] + [f"{cam}_image" for cam in self.debug_cameras],
+ ),
+ )
+ ObsUtils.initialize_obs_utils_with_obs_specs(
+ obs_modality_specs=obs_spec)
+
+ # Configure robosuite environment options
+ options: dict[str, Union[str, list[str], dict[str, Any], bool, int, np.ndarray]] = {}
+ options["env_name"] = "Phantom" # Single-arm environment
+ options["robots"] = [self.robot_name] # Single robot
+ options["gripper_types"] = [f"{self.gripper_name}Gripper"] # Single gripper
+
+ # Configure OSC pose controller
+ controller_config = load_controller_config(default_controller="OSC_POSE")
+ controller_config["control_delta"] = False # Use absolute positioning
+ controller_config["uncouple_pos_ori"] = False # Couple position and orientation
+ options["controller_configs"] = controller_config
+
+ # Camera and observation settings
+ options["camera_heights"] = self.camera_height
+ options["camera_widths"] = self.camera_width
+ options["camera_segmentations"] = "instance" # Instance segmentation masks
+ options["direct_gripper_control"] = True
+ options["use_depth_obs"] = True
+
+ # Set camera parameters
+ options["camera_pos"] = self.camera_params.pos
+ options["camera_quat_wxyz"] = self.camera_params.ori_wxyz
+ options["camera_sensorsize"] = self.camera_params.sensorsize
+ options["camera_principalpixel"] = self.camera_params.principalpixel
+ options["camera_focalpixel"] = self.camera_params.focalpixel
+
+ # Create the robosuite environment
+ self.env = EnvRobosuite(
+ **options,
+ render=render,
+ render_offscreen=True, # Enable offscreen rendering for image capture
+ use_image_obs=True,
+ camera_names=[self.camera_params.name] + self.debug_cameras,
+ control_freq=20, # 20 Hz control frequency
+ )
+
+ # Initialize environment and set robot base position
+ self.reset()
+ self.robot_base_pos = self.DEFAULT_ROBOT_BASE_POS # Fixed base position for single-arm setup
+
+ def reset(self):
+ """Reset environment and clear observation history."""
+ self.env.reset()
+ self.obs_history = deque()
+
+ def close(self):
+ """Close the simulation environment."""
+ self.env.env.close()
+
+ def get_action_from_ee_pose(self, ee_pos: np.ndarray, ee_quat_xyzw: np.ndarray, gripper_action: float,
+ use_base_offset: bool = False) -> np.ndarray:
+ """
+ Convert end-effector pose to robot action vector.
+
+ This method transforms the desired end-effector position and orientation
+ into the action format expected by the robot controller.
+
+ Args:
+ ee_pos: End-effector position as 3D array
+ ee_quat_xyzw: End-effector orientation as quaternion (x, y, z, w)
+ gripper_action: Gripper action value
+ use_base_offset: Whether to add robot base offset to position
+
+ Returns:
+ Action vector [position(3), rotation(3), gripper(1)]
+ """
+ # Handle batch inputs by taking the last element
+ if ee_pos.ndim > 1:
+ ee_pos = ee_pos[-1]
+ ee_quat_xyzw = ee_quat_xyzw[-1]
+
+ # Add base offset if requested
+ if use_base_offset:
+ ee_pos = ee_pos + self.robot_base_pos
+
+ # Apply -135 degree Z rotation for single-arm setup coordinate conversion
+ rot = Rotation.from_quat(ee_quat_xyzw)
+ rot_135deg = Rotation.from_euler('z', -135, degrees=True)
+ new_rot = rot * rot_135deg
+
+ # Convert rotation to axis-angle representation
+ axis_angle = new_rot.as_rotvec()
+
+ # Combine position, rotation, and gripper action into action vector
+ action = np.concatenate([ee_pos, axis_angle, [gripper_action]])
+
+ return action
+
+ def _get_initial_obs_history(self, state: dict) -> deque:
+ """
+ Initialize observation history by repeating the first observation.
+
+ This creates a history buffer filled with the initial robot state,
+ which is useful for algorithms that require temporal context.
+
+ Args:
+ state: Initial robot state dictionary
+
+ Returns:
+ Deque containing repeated initial observations
+ """
+ obs_history = deque(
+ [self.move_to_target_state(state, init=True)],
+ maxlen=self.num_frames,
+ )
+ # Fill remaining slots with copies of the initial observation
+ for _ in range(self.num_frames-1):
+ obs_history.append(self.move_to_target_state(state))
+ return obs_history
+
+ def get_obs_history(self, state: dict) -> list:
+ """
+ Get observation history with specified length.
+
+ Maintains a rolling buffer of recent observations for temporal context.
+
+ Args:
+ state: Current robot state dictionary
+
+ Returns:
+ List of recent observations (length = self.num_frames)
+ """
+ if len(self.obs_history) == 0:
+ # Initialize history if empty
+ self.obs_history = self._get_initial_obs_history(state)
+ else:
+ # Add new observation to history
+ self.obs_history.append(self.move_to_target_state(state))
+ return list(self.obs_history)
+
+    def move_to_target_state(self, state: dict, init: bool = False) -> dict:
+ """
+ Move robot to target state and collect observation data.
+
+ Args:
+ state: Target state containing position, orientation, and gripper state
+ init: Whether this is an initialization step (uses longer movement time)
+
+ Returns:
+ Dictionary containing observation data:
+ - robot_mask: Binary mask showing robot pixels
+ - gripper_mask: Binary mask showing gripper pixels
+ - rgb_img: RGB camera image
+ - depth_img: Depth camera image
+ - robot_pos: Robot end-effector position relative to base
+ - pos_err: Position tracking error magnitude
+ - {cam}_img: Additional camera images if debug_cameras specified
+ """
+ # Convert gripper position to robot action
+ gripper_action = self._convert_handgripper_pos_to_action(state["gripper_pos"])
+
+ # Choose movement duration based on whether this is initialization
+ n_steps = self.n_steps_long if init else self.n_steps_short
+
+ # Execute movement to target pose
+ obs = self.move_to_pose(state["pos"], state["ori_xyzw"], float(gripper_action), n_steps)
+
+ # Extract observation data from simulation
+ robot_mask = np.squeeze(self.get_robot_mask(obs))
+ gripper_mask = np.squeeze(self.get_gripper_mask(obs))
+ rgb_img = self.get_image(obs)
+ depth_img = self.get_depth_image(obs)
+ robot_pos = obs["robot0_eef_pos"] - self.robot_base_pos
+ pos_error = np.linalg.norm(robot_pos - state["pos"])
+
+ # Compile output dictionary
+ output = {
+ "robot_mask": robot_mask,
+ "gripper_mask": gripper_mask,
+ "rgb_img": rgb_img,
+ "depth_img": depth_img,
+ "robot_pos": robot_pos,
+ "pos_err": pos_error,
+ }
+
+ # Add debug camera images if specified
+ for cam in self.debug_cameras:
+ cam_img = self.get_cam_image(obs, cam)
+ output[f"{cam}_img"] = cam_img
+
+ return output
+
+ def _convert_handgripper_pos_to_action(self, gripper_pos: float) -> np.ndarray:
+ """
+ Convert hand gripper position to robot gripper action.
+
+ Maps from physical gripper opening distance to robot action values.
+ Different gripper types may have different mappings.
+
+ Args:
+ gripper_pos: Gripper opening distance in meters
+
+ Returns:
+ Robot gripper action value (0-255 for Robotiq85)
+
+ Raises:
+ ValueError: If gripper type is not supported
+ """
+ if self.gripper_name == "Robotiq85":
+ # Robotiq85 gripper specifications
+ min_gripper_pos, max_gripper_pos = 0.0, 0.085 # 0 to 8.5cm opening
+ gripper_pos = np.clip(gripper_pos, min_gripper_pos, max_gripper_pos)
+ open_gripper_action, closed_gripper_action = 0, 255 # 0=open, 255=closed
+ # Linear interpolation between open and closed states
+ return np.interp(gripper_pos, [min_gripper_pos, max_gripper_pos], [closed_gripper_action, open_gripper_action])
+ else:
+ raise ValueError(f"Gripper name {self.gripper_name} not supported")
+
+ def move_to_pose(self, ee_pos: np.ndarray, ee_ori: np.ndarray, gripper_action: float, n_steps: int) -> dict:
+ """
+ Execute robot movement to target pose.
+
+ Sends action commands to the simulation for the specified number of steps.
+
+ Args:
+ ee_pos: End-effector position as 3D array
+ ee_ori: End-effector orientation as quaternion (x, y, z, w)
+ gripper_action: Gripper action value
+ n_steps: Number of simulation steps to execute
+
+ Returns:
+ Final observation dictionary from simulation
+ """
+ # Convert pose to action vector
+ action = self.get_action_from_ee_pose(ee_pos, ee_ori, gripper_action, use_base_offset=True)
+
+ # Execute action for specified number of steps
+ for _ in range(n_steps):
+ obs, _, _, _ = self.env.step(action)
+ if self.render:
+ self.env.render()
+ return obs
+
+ def get_image(self, obs: dict) -> np.ndarray:
+ """
+ Extract RGB image from observation.
+
+ Handles image format conversion and optional square cropping.
+
+ Args:
+ obs: Observation dictionary containing image data
+
+ Returns:
+ RGB image as numpy array (H, W, 3)
+ """
+ img = obs[f"{self.camera_name}_image"]
+ img = img.transpose(1, 2, 0) # Convert from CHW to HWC format
+ height = img.shape[0]
+ width = img.shape[1]
+
+ # Crop to square if requested
+ if self.square:
+ n_remove = int((width - height)/2)
+ img = img[:,n_remove:-n_remove,:]
+ return img
+
+ def get_cam_image(self, obs: dict, camera_name: str) -> np.ndarray:
+ """
+ Extract RGB image from specific camera.
+
+ Args:
+ obs: Observation dictionary containing image data
+ camera_name: Name of the camera to extract image from
+
+ Returns:
+ RGB image as numpy array (H, W, 3)
+ """
+ img = obs[f"{camera_name}_image"]
+ img = img.transpose(1, 2, 0) # Convert from CHW to HWC format
+ height = img.shape[0]
+ width = img.shape[1]
+
+ # Crop to square if requested
+ if self.square:
+ n_remove = int((width - height)/2)
+ img = img[:,n_remove:-n_remove,:]
+ return img
+
+ def get_seg_image(self, obs: dict) -> np.ndarray:
+ """
+ Extract instance segmentation image.
+
+ Args:
+ obs: Observation dictionary containing segmentation data
+
+ Returns:
+ Segmentation image as uint8 array where each pixel value
+ represents a different object instance ID
+ """
+ img = obs["frontview_segmentation_instance"] # Fixed camera name for single-arm
+ height = img.shape[0]
+ width = img.shape[1]
+
+ # Crop to square if requested
+ if self.square:
+ n_remove = int((width - height)/2)
+ img = img[:,n_remove:-n_remove,:]
+ img = img.astype(np.uint8)
+ return img
+
+ def get_depth_image(self, obs: dict) -> np.ndarray:
+ """
+ Extract and process depth image.
+
+ Converts raw depth buffer to real-world depth values using
+ robosuite's depth processing utilities.
+
+ Args:
+ obs: Observation dictionary containing depth data
+
+ Returns:
+ Depth image as numpy array where values represent
+ distance in meters
+ """
+ img = obs["frontview_depth"] # Fixed camera name for single-arm
+ img = get_real_depth_map(sim=self.env.env.sim, depth_map=img)
+ height = img.shape[0]
+ width = img.shape[1]
+
+ # Crop to square if requested
+ if self.square:
+ n_remove = int((width - height)/2)
+ img = img[:,n_remove:-n_remove,:]
+ return img
+
+ def get_robot_mask(self, obs: dict) -> np.ndarray:
+ """
+ Generate binary mask for robot pixels.
+
+ Uses instance segmentation to identify which pixels belong to
+ the robot arm (instance ID 1).
+
+ Args:
+ obs: Observation dictionary containing segmentation data
+
+ Returns:
+ Binary mask where 1 indicates robot pixels, 0 otherwise
+ """
+ seg_img = self.get_seg_image(obs)
+ mask = np.zeros_like(seg_img)
+ mask[seg_img == 1] = 1 # Robot arm
+ return mask
+
+ def get_gripper_mask(self, obs: dict) -> np.ndarray:
+ """
+ Generate binary mask for gripper pixels.
+
+ Uses instance segmentation to identify which pixels belong to
+ the robot gripper (instance ID 3).
+
+ Args:
+ obs: Observation dictionary containing segmentation data
+
+ Returns:
+ Binary mask where 1 indicates gripper pixels, 0 otherwise
+ """
+ seg_img = self.get_seg_image(obs)
+ mask = np.zeros_like(seg_img)
+ mask[seg_img == 3] = 1 # Gripper
+ return mask
\ No newline at end of file
diff --git a/phantom/phantom/utils/__init__.py b/phantom/phantom/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/phantom/utils/bbox_utils.py b/phantom/phantom/utils/bbox_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b893cda53bb99e67279e70a08ef4557cd79e9fb4
--- /dev/null
+++ b/phantom/phantom/utils/bbox_utils.py
@@ -0,0 +1,38 @@
+import numpy as np
+import numpy.typing as npt
+
+def get_bbox_center(bbox: np.ndarray) -> np.ndarray:
+ """Calculate center point of bounding box."""
+ return np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2])
+
+
+def get_bbox_area(bbox: np.ndarray) -> float:
+ """Get the area of a bounding box."""
+ return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+
+
+def get_overlap_score(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+ """ Get the overlap area between two boxes divided by the area of the smaller box """
+ area1 = get_bbox_area(bbox1)
+ area2 = get_bbox_area(bbox2)
+ overlap_area = get_overlap_area(bbox1, bbox2)
+ return overlap_area / min(area1, area2)
+
+def get_overlap_area(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+ """ Get the overlap area between two boxes """
+ return max(0, min(bbox1[2], bbox2[2]) - max(bbox1[0], bbox2[0])) * max(0, min(bbox1[3], bbox2[3]) - max(bbox1[1], bbox2[1]))
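+
+# Illustrative sketch (hypothetical boxes in xyxy format):
+#     a = np.array([0, 0, 10, 10]); b = np.array([5, 5, 15, 15])
+#     get_overlap_area(a, b)    # 25
+#     get_overlap_score(a, b)   # 25 / min(100, 100) = 0.25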
+
+def get_bbox_center_min_dist_to_edge(bboxes: npt.NDArray[np.float32], W: int, H: int) -> npt.NDArray[np.float32]:
+ """
+ Get the minimum distance of the bbox center to the edge of the image.
+ """
+ center_min_dist_to_edge_list = []
+ for bbox in bboxes:
+ x1, y1, x2, y2 = bbox
+ center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
+ min_dist_to_edge = min(center[0], center[1], W - center[0], H - center[1])
+ center_min_dist_to_edge_list.append(min_dist_to_edge)
+ return np.array(center_min_dist_to_edge_list)
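+
+# Illustrative sketch: for a 100x100 image, a box with center (20, 50) is 20 px from the
+# nearest (left) edge.
+#     get_bbox_center_min_dist_to_edge(np.array([[10, 40, 30, 60]]), W=100, H=100)   # array([20.])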
+
+
+
diff --git a/phantom/phantom/utils/data_utils.py b/phantom/phantom/utils/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..292dbef34ab5ef370de009f5f6f57a6d3ffb31de
--- /dev/null
+++ b/phantom/phantom/utils/data_utils.py
@@ -0,0 +1,38 @@
+import re
+import os
+import numpy as np
+import pandas as pd
+from pathlib import Path
+
+def get_finger_poses_from_pkl(path: Path) -> dict:
+ """Get human finger poses from pkl file."""
+ finger_poses = pd.read_pickle(path)
+ thumb_poses = np.vstack(finger_poses["thumb"])
+ index_poses = np.vstack(finger_poses["index"])
+ hand_ee_poses = np.vstack(finger_poses["hand_ee"])
+ skeleton_poses = np.stack(finger_poses["skeleton"], axis=0)
+ hand_poses = np.stack(finger_poses["hand_pose"], axis=0)
+ all_global_orient = np.vstack(finger_poses["global_orient"])
+ data = {
+ "thumb": thumb_poses,
+ "index": index_poses,
+ "hand_ee": hand_ee_poses,
+ "skeleton": skeleton_poses,
+ "hand_pose": hand_poses,
+ "global_orient": all_global_orient
+ }
+ return data
+
+def get_parent_folder_of_package(package_name: str) -> str:
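+    # e.g. a package whose __init__.py lives at .../site-packages/foo/__init__.py resolves to .../site-packages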
+ # Import the package
+ package = __import__(package_name)
+
+ # Get the absolute path of the imported package
+ package_path = package.__file__
+ if package_path is None:
+ raise ValueError(f"Package {package_name} does not have a valid __file__ attribute")
+ package_path = os.path.abspath(package_path)
+
+ # Get the parent directory of the package directory
+ return os.path.dirname(os.path.dirname(package_path))
+
diff --git a/phantom/phantom/utils/image_utils.py b/phantom/phantom/utils/image_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a61d1d9d2f03d333dd6dc86885c7ce3510eec83
--- /dev/null
+++ b/phantom/phantom/utils/image_utils.py
@@ -0,0 +1,103 @@
+import json
+import numpy as np
+import cv2
+import os
+import mediapy as media
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+@dataclass
+class BoundingBox:
+ xmin: int
+ ymin: int
+ xmax: int
+ ymax: int
+
+ @property
+ def xyxy(self) -> List[float]:
+ return [self.xmin, self.ymin, self.xmax, self.ymax]
+
+
+@dataclass
+class DetectionResult:
+ score: float
+ label: str
+ box: BoundingBox
+ mask: Optional[np.ndarray] = None
+
+ @classmethod
+ def from_dict(cls, detection_dict: Dict) -> "DetectionResult":
+ return cls(
+ score=detection_dict["score"],
+ label=detection_dict["label"],
+ box=BoundingBox(
+ xmin=detection_dict["box"]["xmin"],
+ ymin=detection_dict["box"]["ymin"],
+ xmax=detection_dict["box"]["xmax"],
+ ymax=detection_dict["box"]["ymax"],
+ ),
+ )
+
+def get_transformation_matrix_from_extrinsics(camera_extrinsics: List[Dict]) -> np.ndarray:
+ """Get homogeneous transformation matrix from camera extrinsics."""
+ cam_base_pos = np.array(camera_extrinsics[0]["camera_base_pos"])
+ cam_base_ori = np.array(camera_extrinsics[0]["camera_base_ori"])
+ T_cam2robot = np.eye(4)
+ T_cam2robot[:3, 3] = cam_base_pos
+ T_cam2robot[:3, :3] = np.array(cam_base_ori).reshape(3, 3)
+ return T_cam2robot
+
+
+def get_intrinsics_from_json(json_path: str) -> Tuple[np.ndarray, dict]:
+ with open(json_path, "r") as f:
+ camera_intrinsics = json.load(f)
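+    # Expected layout (only these keys are read below):
+    #   {"left": {"fx": ..., "fy": ..., "cx": ..., "cy": ..., "v_fov": ...}}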
+
+ # Get camera matrix
+ fx = camera_intrinsics["left"]["fx"]
+ fy = camera_intrinsics["left"]["fy"]
+ cx = camera_intrinsics["left"]["cx"]
+ cy = camera_intrinsics["left"]["cy"]
+ v_fov = camera_intrinsics["left"]["v_fov"]
+ intrinsics_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
+
+ intrinsics_dict = {
+ "fx": fx,
+ "fy": fy,
+ "cx": cx,
+ "cy": cy,
+ "v_fov": v_fov,
+ }
+
+ return intrinsics_matrix, intrinsics_dict
+
+def resize_binary_image(image: np.ndarray, new_size: int) -> np.ndarray:
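+    # Binary masks may be stored either as {0, 1} or {0, 255}; the max value below picks the matching threshold.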
+ max_value = np.max(image)
+
+ # Resize the image
+ resized_image = cv2.resize(image, (new_size, new_size), interpolation=cv2.INTER_NEAREST)
+
+ if max_value == 1:
+ _, binary_image = cv2.threshold(resized_image, 0.5, 1, cv2.THRESH_BINARY)
+ else:
+ _, binary_image = cv2.threshold(resized_image, 127, 255, cv2.THRESH_BINARY)
+
+ return binary_image
+
+
+def convert_video_to_images(video_path: str, save_folder: str, square=False, reverse=False):
+ """Save each frame of video as an image in save_folder."""
+ if not os.path.exists(save_folder):
+ os.makedirs(save_folder)
+
+ imgs = np.array(media.read_video(str(video_path)))
+ n_imgs = len(imgs)
+ if reverse:
+ imgs = imgs[::-1]
+ for idx in range(n_imgs):
+ img = imgs[idx]
+ if square:
+            delta = (img.shape[1] - img.shape[0]) // 2
+            if delta > 0:  # avoid an empty slice when the frame is already square
+                img = img[:, delta:-delta, :]
+ media.write_image(f"{save_folder}/{idx:05d}.jpg", img)
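+
+# Illustrative usage (paths are placeholders):
+#   convert_video_to_images("demo.mp4", "frames/", square=True)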
+
+
diff --git a/phantom/phantom/utils/pcd_utils.py b/phantom/phantom/utils/pcd_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0bb7677653781ce386cd14f4e0384a6d514a952
--- /dev/null
+++ b/phantom/phantom/utils/pcd_utils.py
@@ -0,0 +1,210 @@
+import numpy as np
+from typing import Tuple, Optional
+import open3d as o3d # type: ignore
+import trimesh
+from sklearn.neighbors import NearestNeighbors # type: ignore
+
+def preprocess_point_cloud(pcd: o3d.geometry.PointCloud,
+ voxel_size: float) -> Tuple[o3d.geometry.PointCloud, o3d.pipelines.registration.Feature]:
+ """
+ Downsample point cloud to desired voxel resolution and compute FPFH features.
+ """
+ pcd_down = pcd.voxel_down_sample(voxel_size)
+ radius_normal = voxel_size * 2
+ pcd_down.estimate_normals(o3d.geometry.KDTreeSearchParamHybrid(radius=radius_normal, max_nn=30))
+ radius_feature = voxel_size * 5
+ pcd_fpfh = o3d.pipelines.registration.compute_fpfh_feature(
+ pcd_down, o3d.geometry.KDTreeSearchParamHybrid(radius=radius_feature, max_nn=100))
+ return pcd_down, pcd_fpfh
+
+
+def global_registration(source_pcd: o3d.geometry.PointCloud, target_pcd: o3d.geometry.PointCloud,
+ voxel_size: float) -> o3d.pipelines.registration.RegistrationResult:
+ """
+ Register two point clouds using global registration with RANSAC.
+ """
+ source_down, source_fpfh = preprocess_point_cloud(source_pcd, voxel_size)
+ target_down, target_fpfh = preprocess_point_cloud(target_pcd, voxel_size)
+
+ distance_threshold = voxel_size * 1.5
+ result_ransac = o3d.pipelines.registration.registration_ransac_based_on_feature_matching(
+ source_down, target_down, source_fpfh, target_fpfh, True,
+ distance_threshold,
+ o3d.pipelines.registration.TransformationEstimationPointToPoint(),
+        4,  # ransac_n: number of correspondence points sampled in each RANSAC iteration
+ [o3d.pipelines.registration.CorrespondenceCheckerBasedOnEdgeLength(0.9),
+ o3d.pipelines.registration.CorrespondenceCheckerBasedOnDistance(distance_threshold)],
+ o3d.pipelines.registration.RANSACConvergenceCriteria(4000000, 500))
+
+ return result_ransac
+
+
+def icp_registration(source_pcd: o3d.geometry.PointCloud, target_pcd: o3d.geometry.PointCloud,
+ voxel_size: float=0.05, use_global_registration:bool=True,
+ init_transform:Optional[np.ndarray]=None) -> Tuple[o3d.geometry.PointCloud, np.ndarray]:
+ """
+ Register two point clouds using ICP algorithm.
+ """
+ # Optional global registration using RANSAC
+ if use_global_registration:
+ if init_transform is None:
+ result_ransac = global_registration(source_pcd, target_pcd, voxel_size)
+ init_transform = result_ransac.transformation
+ else:
+ init_transform = np.eye(4)
+
+ # Refine alignment using ICP
+ max_correspondence_distance = voxel_size * 5
+ result_icp = o3d.pipelines.registration.registration_icp(
+ source=source_pcd, target=target_pcd, max_correspondence_distance=max_correspondence_distance,
+ init=init_transform,
+ estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint())
+
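+    # If ICP returned the initial transform unchanged, refinement likely failed to converge;
+    # fall back to a fresh RANSAC global registration and run ICP once more from that estimate.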
+ if np.array_equal(init_transform, result_icp.transformation):
+ result_ransac = global_registration(source_pcd, target_pcd, voxel_size)
+ init_transform = result_ransac.transformation
+ result_icp = o3d.pipelines.registration.registration_icp(
+ source=source_pcd, target=target_pcd, max_correspondence_distance=max_correspondence_distance,
+ init=init_transform,
+ estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint())
+
+ aligned_source_pcd = source_pcd.transform(result_icp.transformation)
+
+ return aligned_source_pcd, result_icp.transformation
+
+
+def get_visible_points(mesh, origin: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Return list of points in mesh that are visible from origin.
+ """
+ intersector = trimesh.ray.ray_triangle.RayMeshIntersector(mesh)
+ pts = mesh.vertices
+ vectors = pts - origin
+ directions = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
+ visible_triangle_indices = intersector.intersects_first(np.tile(origin, (pts.shape[0], 1)), directions)
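+    # A vertex counts as visible if it belongs to the first triangle hit by the ray cast
+    # from the origin towards it (a cheap approximation of true visibility).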
+ visible_triangles = mesh.faces[visible_triangle_indices]
+ visible_vertex_indices = np.unique(visible_triangles)
+ visible_points = pts[visible_vertex_indices]
+ return np.array(visible_points).astype(np.float32), np.array(visible_vertex_indices)
+
+
+def get_pcd_from_points(points: np.ndarray, colors: Optional[np.ndarray]=None) -> o3d.geometry.PointCloud:
+ """
+ Convert a list of points to an Open3D point cloud.
+ """
+ pcd = o3d.geometry.PointCloud()
+ pcd.points = o3d.utility.Vector3dVector(points)
+ if colors is not None:
+ pcd.colors = o3d.utility.Vector3dVector(colors)
+ pcd.remove_non_finite_points()
+ return pcd
+
+
+def visualize_pcds(list_pcds: list, visible: bool=True) -> np.ndarray:
+ """
+ Visualize a list of point clouds.
+ """
+ visualization_image = None
+ vis = o3d.visualization.Visualizer()
+ vis.create_window(visible=visible)
+ opt = vis.get_render_option()
+ opt.background_color = np.asarray([0.2, 0.2, 0.2])
+ for pcd in list_pcds:
+ if pcd is not None:
+ vis.add_geometry(pcd)
+ vis.poll_events()
+ vis.update_renderer()
+ if not visible:
+ visualization_image = vis.capture_screen_float_buffer(do_render=True)
+ visualization_image = (255.0 * np.asarray(visualization_image)).astype(np.uint8)
+ if visible:
+ vis.run()
+ vis.destroy_window()
+ if visualization_image is None:
+ visualization_image = np.array([])
+ return visualization_image
+
+def radius_outlier_detection(points: np.ndarray, radius: float=5,
+ min_neighbors: int=5) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Detect outliers in a point cloud using radius-based outlier detection.
+ """
+ # Fit the NearestNeighbors model
+ nbrs = NearestNeighbors(radius=radius).fit(points)
+
+ # Get the number of neighbors for each point within the specified radius
+ distances, indices = nbrs.radius_neighbors(points)
+
+ # Detect points with fewer neighbors than the minimum threshold
+ outliers_mask = np.array([len(neigh) < min_neighbors for neigh in indices])
+
+ outlier_pts = points[outliers_mask]
+
+ return outliers_mask, outlier_pts
+
+
+def remove_outliers(pcd: o3d.geometry.PointCloud, radius: float=5,
+ min_neighbors: int=5) -> Tuple[o3d.geometry.PointCloud, np.ndarray]:
+ """
+ Remove outliers from a point cloud using radius-based outlier detection.
+ """
+ outlier_indices, outlier_pts = radius_outlier_detection(np.asarray(pcd.points),
+ radius=radius, min_neighbors=min_neighbors)
+ filtered_pts = np.asarray(pcd.points)[~outlier_indices]
+ filtered_colors = np.asarray(pcd.colors)[~outlier_indices]
+ filtered_pcd = get_pcd_from_points(filtered_pts, colors=filtered_colors)
+ return filtered_pcd, outlier_indices
+
+def get_3D_points_from_pixels(pixels_2d: np.ndarray, depth_map: np.ndarray, intrinsics: dict) -> np.ndarray:
+ """
+ Convert an array of pixel coordinates and depth map to 3D points.
+ """
+ px = pixels_2d[:, 0]
+ py = pixels_2d[:, 1]
+
+ x = (px - intrinsics["cx"]) / intrinsics["fx"]
+ y = (py - intrinsics["cy"]) / intrinsics["fy"]
+
+ if len(depth_map.shape) == 3:
+ depth_map = depth_map[:, :, 0]
+
+ depth = depth_map[py, px]
+
+ X = x * depth
+ Y = y * depth
+
+ points_3d = np.stack((X, Y, depth), axis=1)
+ return points_3d
+
+def get_point_cloud_of_segmask(mask: np.ndarray, depth_img: np.ndarray, img: np.ndarray,
+ intrinsics: dict, visualize: bool=False) -> o3d.geometry.PointCloud:
+ """
+ Return the point cloud that corresponds to the segmentation mask in the depth image.
+ """
+ idxs_y, idxs_x = mask.nonzero()
+ pixels_2d = np.stack((idxs_x, idxs_y), axis=1)
+ seg_points = get_3D_points_from_pixels(pixels_2d, depth_img, intrinsics)
+    seg_colors = img[idxs_y, idxs_x, :] / 255.0  # Normalize to [0, 1] as expected by Open3D colors
+
+ pcd = get_pcd_from_points(seg_points, colors=seg_colors)
+
+ if visualize:
+ visualize_pcds([pcd])
+
+ return pcd
+
+def get_bbox_of_3d_points(points: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Return the bounding box of 3D points.
+ """
+ min_xyz = np.min(points, axis=0)
+ max_xyz = np.max(points, axis=0)
+ return min_xyz, max_xyz
+
+def trim_pcd_to_bbox(pcd: o3d.geometry.PointCloud, bbox: Tuple[np.ndarray, np.ndarray]) -> o3d.geometry.PointCloud:
+ """
+ Trim a point cloud to the specified bounding box.
+ """
+ min_xyz, max_xyz = bbox
+ trimmed_pcd = pcd.crop(o3d.geometry.AxisAlignedBoundingBox(min_xyz, max_xyz))
+ return trimmed_pcd
\ No newline at end of file
diff --git a/phantom/phantom/utils/transform_utils.py b/phantom/phantom/utils/transform_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbca6ba296d99e50f9a0484d6205a7f60396d582
--- /dev/null
+++ b/phantom/phantom/utils/transform_utils.py
@@ -0,0 +1,43 @@
+import numpy as np
+import math
+
+EPS = np.finfo(float).eps * 4.0
+
+def transform_pts(pts: np.ndarray, T: np.ndarray) -> np.ndarray:
+ pts = np.hstack([pts, np.ones((len(pts), 1))])
+ pts = np.dot(T, pts.T).T
+ return pts[:, :3]
+
+def project_point_to_plane(point: np.ndarray, plane_coeffs: np.ndarray) -> np.ndarray:
+ """
+ Projects a 3D point onto a plane defined by its coefficients.
+
+ Args:
+ point (array-like): Coordinates of the point to be projected (x0, y0, z0).
+ plane_coeffs (array-like): Coefficients of the plane (a, b, c, d) for ax + by + cz + d = 0.
+
+ Returns:
+ numpy.ndarray: The projected point's coordinates on the plane.
+ """
+ # Convert inputs to numpy arrays
+ point = np.array(point)
+ plane_coeffs = np.array(plane_coeffs)
+
+ # Extract the plane normal vector and constant term
+ normal = plane_coeffs[:3] # [a, b, c]
+ d = plane_coeffs[3]
+
+ # Normalize the plane normal vector
+ normal_magnitude = np.linalg.norm(normal)
+ if normal_magnitude == 0:
+ raise ValueError("Invalid plane coefficients: normal vector cannot have zero magnitude.")
+ normal /= normal_magnitude
+
+ # Calculate the signed distance from the point to the plane
+ distance = np.dot(normal, point) + d / normal_magnitude
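+    # i.e. distance = dot(n_hat, p) + d / ||n|| for the plane ax + by + cz + d = 0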
+
+ # Project the point onto the plane
+ projected_point = point - distance * normal
+
+ return projected_point
+
diff --git a/phantom/setup.py b/phantom/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a9fec23ac0eb103e3f849c66355f5bce2f995f4
--- /dev/null
+++ b/phantom/setup.py
@@ -0,0 +1,7 @@
+import setuptools
+
+setuptools.setup(
+ name="phantom",
+ version="0.1",
+ packages=setuptools.find_packages(exclude=["submodules", "submodules.*"]),
+)
\ No newline at end of file
diff --git a/phantom/submodules/phantom-E2FGVI/.gitignore b/phantom/submodules/phantom-E2FGVI/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..de2ae7cce9460e73fbd78398b0e401a3bc7b861f
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/.gitignore
@@ -0,0 +1,136 @@
+# Customized
+*.pth
+*.pt
+keys.txt
+results/
+.vscode/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/__init__.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi.json b/phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi.json
new file mode 100644
index 0000000000000000000000000000000000000000..2093a0deb42da5dd2c8e60f63cfb458ccc6852c2
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi.json
@@ -0,0 +1,41 @@
+{
+ "seed": 2021,
+ "save_dir": "release_model/",
+ "train_data_loader": {
+ "name": "youtube-vos",
+ "data_root": "datasets",
+ "w": 432,
+ "h": 240,
+ "num_local_frames": 5,
+ "num_ref_frames": 3
+ },
+ "losses": {
+ "hole_weight": 1,
+ "valid_weight": 1,
+ "flow_weight": 1,
+ "adversarial_weight": 0.01,
+ "GAN_LOSS": "hinge"
+ },
+ "model": {
+ "net": "e2fgvi",
+ "no_dis": 0
+ },
+ "trainer": {
+ "type": "Adam",
+ "beta1": 0,
+ "beta2": 0.99,
+ "lr": 1e-4,
+ "batch_size": 8,
+ "num_workers": 2,
+ "log_freq": 100,
+ "save_freq": 5e3,
+ "iterations": 50e4,
+ "scheduler": {
+ "type": "MultiStepLR",
+ "milestones": [
+ 40e4
+ ],
+ "gamma": 0.1
+ }
+ }
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi_hq.json b/phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi_hq.json
new file mode 100644
index 0000000000000000000000000000000000000000..6693b731cc62e354e2c27342d9e5a2807e0c0a4a
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi_hq.json
@@ -0,0 +1,41 @@
+{
+ "seed": 2021,
+ "save_dir": "release_model/",
+ "train_data_loader": {
+ "name": "youtube-vos",
+ "data_root": "datasets",
+ "w": 432,
+ "h": 240,
+ "num_local_frames": 5,
+ "num_ref_frames": 3
+ },
+ "losses": {
+ "hole_weight": 1,
+ "valid_weight": 1,
+ "flow_weight": 1,
+ "adversarial_weight": 0.01,
+ "GAN_LOSS": "hinge"
+ },
+ "model": {
+ "net": "e2fgvi_hq",
+ "no_dis": 0
+ },
+ "trainer": {
+ "type": "Adam",
+ "beta1": 0,
+ "beta2": 0.99,
+ "lr": 1e-4,
+ "batch_size": 8,
+ "num_workers": 2,
+ "log_freq": 100,
+ "save_freq": 5e3,
+ "iterations": 50e4,
+ "scheduler": {
+ "type": "MultiStepLR",
+ "milestones": [
+ 40e4
+ ],
+ "gamma": 0.1
+ }
+ }
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/core/__init__.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/core/dataset.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5d7f992c73cc3b32be64e335caf81f236cb0242
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/dataset.py
@@ -0,0 +1,135 @@
+import os
+import json
+import random
+
+import cv2
+from PIL import Image
+import numpy as np
+
+import torch
+import torchvision.transforms as transforms
+
+from core.utils import (TrainZipReader, TestZipReader,
+ create_random_shape_with_random_motion, Stack,
+ ToTorchFormatTensor, GroupRandomHorizontalFlip)
+
+
+class TrainDataset(torch.utils.data.Dataset):
+ def __init__(self, args: dict, debug=False):
+ self.args = args
+ self.num_local_frames = args['num_local_frames']
+ self.num_ref_frames = args['num_ref_frames']
+ self.size = self.w, self.h = (args['w'], args['h'])
+
+ json_path = os.path.join(args['data_root'], args['name'], 'train.json')
+ with open(json_path, 'r') as f:
+ self.video_dict = json.load(f)
+ self.video_names = list(self.video_dict.keys())
+ if debug:
+ self.video_names = self.video_names[:100]
+
+ self._to_tensors = transforms.Compose([
+ Stack(),
+ ToTorchFormatTensor(),
+ ])
+
+ def __len__(self):
+ return len(self.video_names)
+
+ def __getitem__(self, index):
+ item = self.load_item(index)
+ return item
+
+ def _sample_index(self, length, sample_length, num_ref_frame=3):
+ complete_idx_set = list(range(length))
+ pivot = random.randint(0, length - sample_length)
+ local_idx = complete_idx_set[pivot:pivot + sample_length]
+ remain_idx = list(set(complete_idx_set) - set(local_idx))
+ ref_index = sorted(random.sample(remain_idx, num_ref_frame))
+
+ return local_idx + ref_index
+
+ def load_item(self, index):
+ video_name = self.video_names[index]
+ # create masks
+ all_masks = create_random_shape_with_random_motion(
+ self.video_dict[video_name], imageHeight=self.h, imageWidth=self.w)
+
+ # create sample index
+ selected_index = self._sample_index(self.video_dict[video_name],
+ self.num_local_frames,
+ self.num_ref_frames)
+
+ # read video frames
+ frames = []
+ masks = []
+ for idx in selected_index:
+ video_path = os.path.join(self.args['data_root'],
+ self.args['name'], 'JPEGImages',
+ f'{video_name}.zip')
+ img = TrainZipReader.imread(video_path, idx).convert('RGB')
+ img = img.resize(self.size)
+ frames.append(img)
+ masks.append(all_masks[idx])
+
+        # normalize, convert to tensors
+ frames = GroupRandomHorizontalFlip()(frames)
+ frame_tensors = self._to_tensors(frames) * 2.0 - 1.0
+ mask_tensors = self._to_tensors(masks)
+ return frame_tensors, mask_tensors, video_name
+
+
+class TestDataset(torch.utils.data.Dataset):
+ def __init__(self, args):
+ self.args = args
+ self.size = self.w, self.h = args.size
+
+ with open(os.path.join(args.data_root, args.dataset, 'test.json'),
+ 'r') as f:
+ self.video_dict = json.load(f)
+ self.video_names = list(self.video_dict.keys())
+
+ self._to_tensors = transforms.Compose([
+ Stack(),
+ ToTorchFormatTensor(),
+ ])
+
+ def __len__(self):
+ return len(self.video_names)
+
+ def __getitem__(self, index):
+ item = self.load_item(index)
+ return item
+
+ def load_item(self, index):
+ video_name = self.video_names[index]
+ ref_index = list(range(self.video_dict[video_name]))
+
+ # read video frames
+ frames = []
+ masks = []
+ for idx in ref_index:
+ video_path = os.path.join(self.args.data_root, self.args.dataset,
+ 'JPEGImages', f'{video_name}.zip')
+ img = TestZipReader.imread(video_path, idx).convert('RGB')
+ img = img.resize(self.size)
+ frames.append(img)
+ mask_path = os.path.join(self.args.data_root, self.args.dataset,
+ 'test_masks', video_name,
+ str(idx).zfill(5) + '.png')
+ mask = Image.open(mask_path).resize(self.size,
+ Image.NEAREST).convert('L')
+ # origin: 0 indicates missing. now: 1 indicates missing
+ mask = np.asarray(mask)
+ m = np.array(mask > 0).astype(np.uint8)
+ m = cv2.dilate(m,
+ cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)),
+ iterations=4)
+ mask = Image.fromarray(m * 255)
+ masks.append(mask)
+
+ # to tensors
+ frames_PIL = [np.array(f).astype(np.uint8) for f in frames]
+ frame_tensors = self._to_tensors(frames) * 2.0 - 1.0
+ mask_tensors = self._to_tensors(masks)
+ return frame_tensors, mask_tensors, video_name, frames_PIL
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/core/dist.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e4e9e670a3b853fac345618d3557d648d813902
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/dist.py
@@ -0,0 +1,47 @@
+import os
+import torch
+
+
+def get_world_size():
+ """Find OMPI world size without calling mpi functions
+ :rtype: int
+ """
+ if os.environ.get('PMI_SIZE') is not None:
+ return int(os.environ.get('PMI_SIZE') or 1)
+ elif os.environ.get('OMPI_COMM_WORLD_SIZE') is not None:
+ return int(os.environ.get('OMPI_COMM_WORLD_SIZE') or 1)
+ else:
+ return torch.cuda.device_count()
+
+
+def get_global_rank():
+ """Find OMPI world rank without calling mpi functions
+ :rtype: int
+ """
+ if os.environ.get('PMI_RANK') is not None:
+ return int(os.environ.get('PMI_RANK') or 0)
+ elif os.environ.get('OMPI_COMM_WORLD_RANK') is not None:
+ return int(os.environ.get('OMPI_COMM_WORLD_RANK') or 0)
+ else:
+ return 0
+
+
+def get_local_rank():
+ """Find OMPI local rank without calling mpi functions
+ :rtype: int
+ """
+ if os.environ.get('MPI_LOCALRANKID') is not None:
+ return int(os.environ.get('MPI_LOCALRANKID') or 0)
+ elif os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') is not None:
+ return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') or 0)
+ else:
+ return 0
+
+
+def get_master_ip():
+ if os.environ.get('AZ_BATCH_MASTER_NODE') is not None:
+ return os.environ.get('AZ_BATCH_MASTER_NODE').split(':')[0]
+ elif os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE') is not None:
+ return os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE')
+ else:
+ return "127.0.0.1"
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/core/loss.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..19d0d5f4e3118d82a844921a99b5aa66f05bb7d6
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/loss.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+
+class AdversarialLoss(nn.Module):
+ r"""
+ Adversarial loss
+ https://arxiv.org/abs/1711.10337
+ """
+ def __init__(self,
+ type='nsgan',
+ target_real_label=1.0,
+ target_fake_label=0.0):
+ r"""
+ type = nsgan | lsgan | hinge
+ """
+ super(AdversarialLoss, self).__init__()
+ self.type = type
+ self.register_buffer('real_label', torch.tensor(target_real_label))
+ self.register_buffer('fake_label', torch.tensor(target_fake_label))
+
+ if type == 'nsgan':
+ self.criterion = nn.BCELoss()
+ elif type == 'lsgan':
+ self.criterion = nn.MSELoss()
+ elif type == 'hinge':
+ self.criterion = nn.ReLU()
+
+ def __call__(self, outputs, is_real, is_disc=None):
+ if self.type == 'hinge':
+ if is_disc:
+ if is_real:
+ outputs = -outputs
+ return self.criterion(1 + outputs).mean()
+ else:
+ return (-outputs).mean()
+ else:
+ labels = (self.real_label
+ if is_real else self.fake_label).expand_as(outputs)
+ loss = self.criterion(outputs, labels)
+ return loss
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/core/lr_scheduler.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bd1341cdcc64aa1c2a416b837551590ded4a43d
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/lr_scheduler.py
@@ -0,0 +1,112 @@
+"""
+ LR scheduler from BasicSR https://github.com/xinntao/BasicSR
+"""
+import math
+from collections import Counter
+from torch.optim.lr_scheduler import _LRScheduler
+
+
+class MultiStepRestartLR(_LRScheduler):
+ """ MultiStep with restarts learning rate scheme.
+ Args:
+ optimizer (torch.nn.optimizer): Torch optimizer.
+ milestones (list): Iterations that will decrease learning rate.
+ gamma (float): Decrease ratio. Default: 0.1.
+ restarts (list): Restart iterations. Default: [0].
+ restart_weights (list): Restart weights at each restart iteration.
+ Default: [1].
+ last_epoch (int): Used in _LRScheduler. Default: -1.
+ """
+ def __init__(self,
+ optimizer,
+ milestones,
+ gamma=0.1,
+ restarts=(0, ),
+ restart_weights=(1, ),
+ last_epoch=-1):
+ self.milestones = Counter(milestones)
+ self.gamma = gamma
+ self.restarts = restarts
+ self.restart_weights = restart_weights
+ assert len(self.restarts) == len(
+ self.restart_weights), 'restarts and their weights do not match.'
+ super(MultiStepRestartLR, self).__init__(optimizer, last_epoch)
+
+ def get_lr(self):
+ if self.last_epoch in self.restarts:
+ weight = self.restart_weights[self.restarts.index(self.last_epoch)]
+ return [
+ group['initial_lr'] * weight
+ for group in self.optimizer.param_groups
+ ]
+ if self.last_epoch not in self.milestones:
+ return [group['lr'] for group in self.optimizer.param_groups]
+ return [
+ group['lr'] * self.gamma**self.milestones[self.last_epoch]
+ for group in self.optimizer.param_groups
+ ]
+
+
+def get_position_from_periods(iteration, cumulative_period):
+ """Get the position from a period list.
+ It will return the index of the right-closest number in the period list.
+ For example, the cumulative_period = [100, 200, 300, 400],
+ if iteration == 50, return 0;
+ if iteration == 210, return 2;
+ if iteration == 300, return 2.
+ Args:
+ iteration (int): Current iteration.
+ cumulative_period (list[int]): Cumulative period list.
+ Returns:
+ int: The position of the right-closest number in the period list.
+ """
+ for i, period in enumerate(cumulative_period):
+ if iteration <= period:
+ return i
+
+
+class CosineAnnealingRestartLR(_LRScheduler):
+ """ Cosine annealing with restarts learning rate scheme.
+ An example of config:
+ periods = [10, 10, 10, 10]
+ restart_weights = [1, 0.5, 0.5, 0.5]
+ eta_min=1e-7
+ It has four cycles, each has 10 iterations. At 10th, 20th, 30th, the
+ scheduler will restart with the weights in restart_weights.
+ Args:
+ optimizer (torch.nn.optimizer): Torch optimizer.
+        periods (list): Period for each cosine annealing cycle.
+        restart_weights (list): Restart weights at each restart iteration.
+            Default: [1].
+        eta_min (float): The minimum lr. Default: 0.
+ last_epoch (int): Used in _LRScheduler. Default: -1.
+ """
+ def __init__(self,
+ optimizer,
+ periods,
+ restart_weights=(1, ),
+ eta_min=1e-7,
+ last_epoch=-1):
+ self.periods = periods
+ self.restart_weights = restart_weights
+ self.eta_min = eta_min
+ assert (len(self.periods) == len(self.restart_weights)
+ ), 'periods and restart_weights should have the same length.'
+ self.cumulative_period = [
+ sum(self.periods[0:i + 1]) for i in range(0, len(self.periods))
+ ]
+ super(CosineAnnealingRestartLR, self).__init__(optimizer, last_epoch)
+
+ def get_lr(self):
+ idx = get_position_from_periods(self.last_epoch,
+ self.cumulative_period)
+ current_weight = self.restart_weights[idx]
+ nearest_restart = 0 if idx == 0 else self.cumulative_period[idx - 1]
+ current_period = self.periods[idx]
+
+ return [
+ self.eta_min + current_weight * 0.5 * (base_lr - self.eta_min) *
+ (1 + math.cos(math.pi * (
+ (self.last_epoch - nearest_restart) / current_period)))
+ for base_lr in self.base_lrs
+ ]
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/core/metrics.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..441613d8e96983b4dc72ca046a16790011b23e2a
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/metrics.py
@@ -0,0 +1,570 @@
+import numpy as np
+from skimage import measure
+from scipy import linalg
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from core.utils import to_tensors
+
+
+def calculate_epe(flow1, flow2):
+ """Calculate End point errors."""
+
+ epe = torch.sum((flow1 - flow2)**2, dim=1).sqrt()
+ epe = epe.view(-1)
+ return epe.mean().item()
+
+
+def calculate_psnr(img1, img2):
+ """Calculate PSNR (Peak Signal-to-Noise Ratio).
+ Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio
+ Args:
+ img1 (ndarray): Images with range [0, 255].
+ img2 (ndarray): Images with range [0, 255].
+ Returns:
+ float: psnr result.
+ """
+
+ assert img1.shape == img2.shape, \
+        (f'Image shapes are different: {img1.shape}, {img2.shape}.')
+
+ mse = np.mean((img1 - img2)**2)
+ if mse == 0:
+ return float('inf')
+ return 20. * np.log10(255. / np.sqrt(mse))
+
+
+def calc_psnr_and_ssim(img1, img2):
+ """Calculate PSNR and SSIM for images.
+ img1: ndarray, range [0, 255]
+ img2: ndarray, range [0, 255]
+ """
+ img1 = img1.astype(np.float64)
+ img2 = img2.astype(np.float64)
+
+ psnr = calculate_psnr(img1, img2)
+ ssim = measure.compare_ssim(img1,
+ img2,
+ data_range=255,
+ multichannel=True,
+ win_size=65)
+
+ return psnr, ssim
+
+
+###########################
+# I3D models
+###########################
+
+
+def init_i3d_model():
+ i3d_model_path = './release_model/i3d_rgb_imagenet.pt'
+ print(f"[Loading I3D model from {i3d_model_path} for FID score ..]")
+ i3d_model = InceptionI3d(400, in_channels=3, final_endpoint='Logits')
+ i3d_model.load_state_dict(torch.load(i3d_model_path))
+ i3d_model.to(torch.device('cuda:0'))
+ return i3d_model
+
+
+def calculate_i3d_activations(video1, video2, i3d_model, device):
+ """Calculate VFID metric.
+ video1: list[PIL.Image]
+ video2: list[PIL.Image]
+ """
+ video1 = to_tensors()(video1).unsqueeze(0).to(device)
+ video2 = to_tensors()(video2).unsqueeze(0).to(device)
+ video1_activations = get_i3d_activations(
+ video1, i3d_model).cpu().numpy().flatten()
+ video2_activations = get_i3d_activations(
+ video2, i3d_model).cpu().numpy().flatten()
+
+ return video1_activations, video2_activations
+
+
+def calculate_vfid(real_activations, fake_activations):
+ """
+ Given two distribution of features, compute the FID score between them
+ Params:
+ real_activations: list[ndarray]
+ fake_activations: list[ndarray]
+ """
+ m1 = np.mean(real_activations, axis=0)
+ m2 = np.mean(fake_activations, axis=0)
+ s1 = np.cov(real_activations, rowvar=False)
+ s2 = np.cov(fake_activations, rowvar=False)
+ return calculate_frechet_distance(m1, s1, m2, s2)
+
+
+def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
+ """Numpy implementation of the Frechet Distance.
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
+ and X_2 ~ N(mu_2, C_2) is
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
+ Stable version by Dougal J. Sutherland.
+ Params:
+ -- mu1 : Numpy array containing the activations of a layer of the
+ inception net (like returned by the function 'get_predictions')
+ for generated samples.
+    -- mu2   : The sample mean over activations, precalculated on a
+               representative data set.
+    -- sigma1: The covariance matrix over activations for generated samples.
+    -- sigma2: The covariance matrix over activations, precalculated on a
+               representative data set.
+ Returns:
+ -- : The Frechet Distance.
+ """
+
+ mu1 = np.atleast_1d(mu1)
+ mu2 = np.atleast_1d(mu2)
+
+ sigma1 = np.atleast_2d(sigma1)
+ sigma2 = np.atleast_2d(sigma2)
+
+ assert mu1.shape == mu2.shape, \
+ 'Training and test mean vectors have different lengths'
+ assert sigma1.shape == sigma2.shape, \
+ 'Training and test covariances have different dimensions'
+
+ diff = mu1 - mu2
+
+ # Product might be almost singular
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+ if not np.isfinite(covmean).all():
+ msg = ('fid calculation produces singular product; '
+ 'adding %s to diagonal of cov estimates') % eps
+ print(msg)
+ offset = np.eye(sigma1.shape[0]) * eps
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
+
+ # Numerical error might give slight imaginary component
+ if np.iscomplexobj(covmean):
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+ m = np.max(np.abs(covmean.imag))
+ raise ValueError('Imaginary component {}'.format(m))
+ covmean = covmean.real
+
+ tr_covmean = np.trace(covmean)
+
+ return (diff.dot(diff) + np.trace(sigma1) + # NOQA
+ np.trace(sigma2) - 2 * tr_covmean)
+
+
+def get_i3d_activations(batched_video,
+ i3d_model,
+ target_endpoint='Logits',
+ flatten=True,
+ grad_enabled=False):
+ """
+ Get features from i3d model and flatten them to 1d feature,
+ valid target endpoints are defined in InceptionI3d.VALID_ENDPOINTS
+ VALID_ENDPOINTS = (
+ 'Conv3d_1a_7x7',
+ 'MaxPool3d_2a_3x3',
+ 'Conv3d_2b_1x1',
+ 'Conv3d_2c_3x3',
+ 'MaxPool3d_3a_3x3',
+ 'Mixed_3b',
+ 'Mixed_3c',
+ 'MaxPool3d_4a_3x3',
+ 'Mixed_4b',
+ 'Mixed_4c',
+ 'Mixed_4d',
+ 'Mixed_4e',
+ 'Mixed_4f',
+ 'MaxPool3d_5a_2x2',
+ 'Mixed_5b',
+ 'Mixed_5c',
+ 'Logits',
+ 'Predictions',
+ )
+ """
+ with torch.set_grad_enabled(grad_enabled):
+ feat = i3d_model.extract_features(batched_video.transpose(1, 2),
+ target_endpoint)
+ if flatten:
+ feat = feat.view(feat.size(0), -1)
+
+ return feat
+
+
+# This code is from https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py
+# I only fix flake8 errors and do some cleaning here
+
+
+class MaxPool3dSamePadding(nn.MaxPool3d):
+ def compute_pad(self, dim, s):
+ if s % self.stride[dim] == 0:
+ return max(self.kernel_size[dim] - self.stride[dim], 0)
+ else:
+ return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)
+
+ def forward(self, x):
+ # compute 'same' padding
+ (batch, channel, t, h, w) = x.size()
+ pad_t = self.compute_pad(0, t)
+ pad_h = self.compute_pad(1, h)
+ pad_w = self.compute_pad(2, w)
+
+ pad_t_f = pad_t // 2
+ pad_t_b = pad_t - pad_t_f
+ pad_h_f = pad_h // 2
+ pad_h_b = pad_h - pad_h_f
+ pad_w_f = pad_w // 2
+ pad_w_b = pad_w - pad_w_f
+
+ pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
+ x = F.pad(x, pad)
+ return super(MaxPool3dSamePadding, self).forward(x)
+
+
+class Unit3D(nn.Module):
+ def __init__(self,
+ in_channels,
+ output_channels,
+ kernel_shape=(1, 1, 1),
+ stride=(1, 1, 1),
+ padding=0,
+ activation_fn=F.relu,
+ use_batch_norm=True,
+ use_bias=False,
+ name='unit_3d'):
+ """Initializes Unit3D module."""
+ super(Unit3D, self).__init__()
+
+ self._output_channels = output_channels
+ self._kernel_shape = kernel_shape
+ self._stride = stride
+ self._use_batch_norm = use_batch_norm
+ self._activation_fn = activation_fn
+ self._use_bias = use_bias
+ self.name = name
+ self.padding = padding
+
+ self.conv3d = nn.Conv3d(
+ in_channels=in_channels,
+ out_channels=self._output_channels,
+ kernel_size=self._kernel_shape,
+ stride=self._stride,
+ padding=0, # we always want padding to be 0 here. We will
+ # dynamically pad based on input size in forward function
+ bias=self._use_bias)
+
+ if self._use_batch_norm:
+ self.bn = nn.BatchNorm3d(self._output_channels,
+ eps=0.001,
+ momentum=0.01)
+
+ def compute_pad(self, dim, s):
+ if s % self._stride[dim] == 0:
+ return max(self._kernel_shape[dim] - self._stride[dim], 0)
+ else:
+ return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)
+
+ def forward(self, x):
+ # compute 'same' padding
+ (batch, channel, t, h, w) = x.size()
+ pad_t = self.compute_pad(0, t)
+ pad_h = self.compute_pad(1, h)
+ pad_w = self.compute_pad(2, w)
+
+ pad_t_f = pad_t // 2
+ pad_t_b = pad_t - pad_t_f
+ pad_h_f = pad_h // 2
+ pad_h_b = pad_h - pad_h_f
+ pad_w_f = pad_w // 2
+ pad_w_b = pad_w - pad_w_f
+
+ pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
+ x = F.pad(x, pad)
+
+ x = self.conv3d(x)
+ if self._use_batch_norm:
+ x = self.bn(x)
+ if self._activation_fn is not None:
+ x = self._activation_fn(x)
+ return x
+
+
+class InceptionModule(nn.Module):
+ def __init__(self, in_channels, out_channels, name):
+ super(InceptionModule, self).__init__()
+
+ self.b0 = Unit3D(in_channels=in_channels,
+ output_channels=out_channels[0],
+ kernel_shape=[1, 1, 1],
+ padding=0,
+ name=name + '/Branch_0/Conv3d_0a_1x1')
+ self.b1a = Unit3D(in_channels=in_channels,
+ output_channels=out_channels[1],
+ kernel_shape=[1, 1, 1],
+ padding=0,
+ name=name + '/Branch_1/Conv3d_0a_1x1')
+ self.b1b = Unit3D(in_channels=out_channels[1],
+ output_channels=out_channels[2],
+ kernel_shape=[3, 3, 3],
+ name=name + '/Branch_1/Conv3d_0b_3x3')
+ self.b2a = Unit3D(in_channels=in_channels,
+ output_channels=out_channels[3],
+ kernel_shape=[1, 1, 1],
+ padding=0,
+ name=name + '/Branch_2/Conv3d_0a_1x1')
+ self.b2b = Unit3D(in_channels=out_channels[3],
+ output_channels=out_channels[4],
+ kernel_shape=[3, 3, 3],
+ name=name + '/Branch_2/Conv3d_0b_3x3')
+ self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
+ stride=(1, 1, 1),
+ padding=0)
+ self.b3b = Unit3D(in_channels=in_channels,
+ output_channels=out_channels[5],
+ kernel_shape=[1, 1, 1],
+ padding=0,
+ name=name + '/Branch_3/Conv3d_0b_1x1')
+ self.name = name
+
+ def forward(self, x):
+ b0 = self.b0(x)
+ b1 = self.b1b(self.b1a(x))
+ b2 = self.b2b(self.b2a(x))
+ b3 = self.b3b(self.b3a(x))
+ return torch.cat([b0, b1, b2, b3], dim=1)
+
+
+class InceptionI3d(nn.Module):
+ """Inception-v1 I3D architecture.
+ The model is introduced in:
+ Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
+ Joao Carreira, Andrew Zisserman
+ https://arxiv.org/pdf/1705.07750v1.pdf.
+ See also the Inception architecture, introduced in:
+ Going deeper with convolutions
+ Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
+ Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
+ http://arxiv.org/pdf/1409.4842v1.pdf.
+ """
+
+ # Endpoints of the model in order. During construction, all the endpoints up
+ # to a designated `final_endpoint` are returned in a dictionary as the
+ # second return value.
+ VALID_ENDPOINTS = (
+ 'Conv3d_1a_7x7',
+ 'MaxPool3d_2a_3x3',
+ 'Conv3d_2b_1x1',
+ 'Conv3d_2c_3x3',
+ 'MaxPool3d_3a_3x3',
+ 'Mixed_3b',
+ 'Mixed_3c',
+ 'MaxPool3d_4a_3x3',
+ 'Mixed_4b',
+ 'Mixed_4c',
+ 'Mixed_4d',
+ 'Mixed_4e',
+ 'Mixed_4f',
+ 'MaxPool3d_5a_2x2',
+ 'Mixed_5b',
+ 'Mixed_5c',
+ 'Logits',
+ 'Predictions',
+ )
+
+ def __init__(self,
+ num_classes=400,
+ spatial_squeeze=True,
+ final_endpoint='Logits',
+ name='inception_i3d',
+ in_channels=3,
+ dropout_keep_prob=0.5):
+ """Initializes I3D model instance.
+ Args:
+ num_classes: The number of outputs in the logit layer (default 400, which
+ matches the Kinetics dataset).
+ spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
+ before returning (default True).
+ final_endpoint: The model contains many possible endpoints.
+ `final_endpoint` specifies the last endpoint for the model to be built
+ up to. In addition to the output at `final_endpoint`, all the outputs
+ at endpoints up to `final_endpoint` will also be returned, in a
+ dictionary. `final_endpoint` must be one of
+ InceptionI3d.VALID_ENDPOINTS (default 'Logits').
+ name: A string (optional). The name of this module.
+ Raises:
+ ValueError: if `final_endpoint` is not recognized.
+ """
+
+ if final_endpoint not in self.VALID_ENDPOINTS:
+ raise ValueError('Unknown final endpoint %s' % final_endpoint)
+
+ super(InceptionI3d, self).__init__()
+ self._num_classes = num_classes
+ self._spatial_squeeze = spatial_squeeze
+ self._final_endpoint = final_endpoint
+ self.logits = None
+
+ if self._final_endpoint not in self.VALID_ENDPOINTS:
+ raise ValueError('Unknown final endpoint %s' %
+ self._final_endpoint)
+
+ self.end_points = {}
+ end_point = 'Conv3d_1a_7x7'
+ self.end_points[end_point] = Unit3D(in_channels=in_channels,
+ output_channels=64,
+ kernel_shape=[7, 7, 7],
+ stride=(2, 2, 2),
+ padding=(3, 3, 3),
+ name=name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'MaxPool3d_2a_3x3'
+ self.end_points[end_point] = MaxPool3dSamePadding(
+ kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Conv3d_2b_1x1'
+ self.end_points[end_point] = Unit3D(in_channels=64,
+ output_channels=64,
+ kernel_shape=[1, 1, 1],
+ padding=0,
+ name=name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Conv3d_2c_3x3'
+ self.end_points[end_point] = Unit3D(in_channels=64,
+ output_channels=192,
+ kernel_shape=[3, 3, 3],
+ padding=1,
+ name=name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'MaxPool3d_3a_3x3'
+ self.end_points[end_point] = MaxPool3dSamePadding(
+ kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_3b'
+ self.end_points[end_point] = InceptionModule(192,
+ [64, 96, 128, 16, 32, 32],
+ name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_3c'
+ self.end_points[end_point] = InceptionModule(
+ 256, [128, 128, 192, 32, 96, 64], name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'MaxPool3d_4a_3x3'
+ self.end_points[end_point] = MaxPool3dSamePadding(
+ kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_4b'
+ self.end_points[end_point] = InceptionModule(
+ 128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_4c'
+ self.end_points[end_point] = InceptionModule(
+ 192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_4d'
+ self.end_points[end_point] = InceptionModule(
+ 160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_4e'
+ self.end_points[end_point] = InceptionModule(
+ 128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_4f'
+ self.end_points[end_point] = InceptionModule(
+ 112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128],
+ name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'MaxPool3d_5a_2x2'
+ self.end_points[end_point] = MaxPool3dSamePadding(
+ kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_5b'
+ self.end_points[end_point] = InceptionModule(
+ 256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128],
+ name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Mixed_5c'
+ self.end_points[end_point] = InceptionModule(
+ 256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128],
+ name + end_point)
+ if self._final_endpoint == end_point:
+ return
+
+ end_point = 'Logits'
+ self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1))
+ self.dropout = nn.Dropout(dropout_keep_prob)
+ self.logits = Unit3D(in_channels=384 + 384 + 128 + 128,
+ output_channels=self._num_classes,
+ kernel_shape=[1, 1, 1],
+ padding=0,
+ activation_fn=None,
+ use_batch_norm=False,
+ use_bias=True,
+ name='logits')
+
+ self.build()
+
+ def replace_logits(self, num_classes):
+ self._num_classes = num_classes
+ self.logits = Unit3D(in_channels=384 + 384 + 128 + 128,
+ output_channels=self._num_classes,
+ kernel_shape=[1, 1, 1],
+ padding=0,
+ activation_fn=None,
+ use_batch_norm=False,
+ use_bias=True,
+ name='logits')
+
+ def build(self):
+ for k in self.end_points.keys():
+ self.add_module(k, self.end_points[k])
+
+ def forward(self, x):
+ for end_point in self.VALID_ENDPOINTS:
+ if end_point in self.end_points:
+ x = self._modules[end_point](
+ x) # use _modules to work with dataparallel
+
+ x = self.logits(self.dropout(self.avg_pool(x)))
+ if self._spatial_squeeze:
+ logits = x.squeeze(3).squeeze(3)
+ # logits is batch X time X classes, which is what we want to work with
+ return logits
+
+ def extract_features(self, x, target_endpoint='Logits'):
+ for end_point in self.VALID_ENDPOINTS:
+ if end_point in self.end_points:
+ x = self._modules[end_point](x)
+ if end_point == target_endpoint:
+ break
+ if target_endpoint == 'Logits':
+ return x.mean(4).mean(3).mean(2)
+ else:
+ return x
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/core/trainer.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b52e7fad9260f904375c295392f208f0ac624aef
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/trainer.py
@@ -0,0 +1,399 @@
+import os
+import glob
+import logging
+import importlib
+from tqdm import tqdm
+
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+
+from core.lr_scheduler import MultiStepRestartLR, CosineAnnealingRestartLR
+from core.loss import AdversarialLoss
+from core.dataset import TrainDataset
+from model.modules.flow_comp import FlowCompletionLoss
+
+
+class Trainer:
+ def __init__(self, config):
+ self.config = config
+ self.epoch = 0
+ self.iteration = 0
+ self.num_local_frames = config['train_data_loader']['num_local_frames']
+ self.num_ref_frames = config['train_data_loader']['num_ref_frames']
+ self.spynet_lr = config['trainer'].get('spynet_lr', 1.0)
+
+ # setup data set and data loader
+ self.train_dataset = TrainDataset(config['train_data_loader'])
+
+ self.train_sampler = None
+ self.train_args = config['trainer']
+ if config['distributed']:
+ self.train_sampler = DistributedSampler(
+ self.train_dataset,
+ num_replicas=config['world_size'],
+ rank=config['global_rank'])
+
+ self.train_loader = DataLoader(
+ self.train_dataset,
+ batch_size=self.train_args['batch_size'] // config['world_size'],
+ shuffle=(self.train_sampler is None),
+ num_workers=self.train_args['num_workers'],
+ sampler=self.train_sampler)
+
+ # set loss functions
+ self.adversarial_loss = AdversarialLoss(
+ type=self.config['losses']['GAN_LOSS'])
+ self.adversarial_loss = self.adversarial_loss.to(self.config['device'])
+ self.l1_loss = nn.L1Loss()
+ self.flow_comp_loss = FlowCompletionLoss().to(self.config['device'])
+
+ # setup models including generator and discriminator
+ net = importlib.import_module('model.' + config['model']['net'])
+ self.netG = net.InpaintGenerator()
+ print(self.netG)
+ self.netG = self.netG.to(self.config['device'])
+ if not self.config['model']['no_dis']:
+ self.netD = net.Discriminator(
+ in_channels=3,
+ use_sigmoid=config['losses']['GAN_LOSS'] != 'hinge')
+ self.netD = self.netD.to(self.config['device'])
+
+ # setup optimizers and schedulers
+ self.setup_optimizers()
+ self.setup_schedulers()
+ self.load()
+
+ if config['distributed']:
+ self.netG = DDP(self.netG,
+ device_ids=[self.config['local_rank']],
+ output_device=self.config['local_rank'],
+ broadcast_buffers=True,
+ find_unused_parameters=True)
+ if not self.config['model']['no_dis']:
+ self.netD = DDP(self.netD,
+ device_ids=[self.config['local_rank']],
+ output_device=self.config['local_rank'],
+ broadcast_buffers=True,
+ find_unused_parameters=False)
+
+ # set summary writer
+ self.dis_writer = None
+ self.gen_writer = None
+ self.summary = {}
+ if self.config['global_rank'] == 0 or (not config['distributed']):
+ self.dis_writer = SummaryWriter(
+ os.path.join(config['save_dir'], 'dis'))
+ self.gen_writer = SummaryWriter(
+ os.path.join(config['save_dir'], 'gen'))
+
+ def setup_optimizers(self):
+ """Set up optimizers."""
+ backbone_params = []
+ spynet_params = []
+ for name, param in self.netG.named_parameters():
+ if 'update_spynet' in name:
+ spynet_params.append(param)
+ else:
+ backbone_params.append(param)
+
+ optim_params = [
+ {
+ 'params': backbone_params,
+ 'lr': self.config['trainer']['lr']
+ },
+ { # finetuning learning rate for spynet
+ 'params': spynet_params,
+ 'lr': self.config['trainer']['lr'] * self.spynet_lr
+ },
+ ]
+
+ self.optimG = torch.optim.Adam(optim_params,
+ betas=(self.config['trainer']['beta1'],
+ self.config['trainer']['beta2']))
+
+ if not self.config['model']['no_dis']:
+ self.optimD = torch.optim.Adam(
+ self.netD.parameters(),
+ lr=self.config['trainer']['lr'],
+ betas=(self.config['trainer']['beta1'],
+ self.config['trainer']['beta2']))
+
+ def setup_schedulers(self):
+ """Set up schedulers."""
+ scheduler_opt = self.config['trainer']['scheduler']
+ scheduler_type = scheduler_opt.pop('type')
+
+ if scheduler_type in ['MultiStepLR', 'MultiStepRestartLR']:
+ self.scheG = MultiStepRestartLR(
+ self.optimG,
+ milestones=scheduler_opt['milestones'],
+ gamma=scheduler_opt['gamma'])
+ self.scheD = MultiStepRestartLR(
+ self.optimD,
+ milestones=scheduler_opt['milestones'],
+ gamma=scheduler_opt['gamma'])
+ elif scheduler_type == 'CosineAnnealingRestartLR':
+ self.scheG = CosineAnnealingRestartLR(
+ self.optimG,
+ periods=scheduler_opt['periods'],
+ restart_weights=scheduler_opt['restart_weights'])
+ self.scheD = CosineAnnealingRestartLR(
+ self.optimD,
+ periods=scheduler_opt['periods'],
+ restart_weights=scheduler_opt['restart_weights'])
+ else:
+ raise NotImplementedError(
+ f'Scheduler {scheduler_type} is not implemented yet.')
+
+ def update_learning_rate(self):
+ """Update learning rate."""
+ self.scheG.step()
+ self.scheD.step()
+
+ def get_lr(self):
+ """Get current learning rate."""
+ return self.optimG.param_groups[0]['lr']
+
+ def add_summary(self, writer, name, val):
+ """Add tensorboard summary."""
+ if name not in self.summary:
+ self.summary[name] = 0
+ self.summary[name] += val
+ if writer is not None and self.iteration % 100 == 0:
+ writer.add_scalar(name, self.summary[name] / 100, self.iteration)
+ self.summary[name] = 0
+
+ def load(self):
+ """Load netG (and netD)."""
+ # get the latest checkpoint
+ model_path = self.config['save_dir']
+ if os.path.isfile(os.path.join(model_path, 'latest.ckpt')):
+ latest_epoch = open(os.path.join(model_path, 'latest.ckpt'),
+ 'r').read().splitlines()[-1]
+ else:
+ ckpts = [
+ os.path.basename(i).split('.pth')[0]
+ for i in glob.glob(os.path.join(model_path, '*.pth'))
+ ]
+ ckpts.sort()
+ latest_epoch = ckpts[-1] if len(ckpts) > 0 else None
+
+ if latest_epoch is not None:
+ gen_path = os.path.join(model_path,
+ f'gen_{int(latest_epoch):06d}.pth')
+ dis_path = os.path.join(model_path,
+ f'dis_{int(latest_epoch):06d}.pth')
+ opt_path = os.path.join(model_path,
+ f'opt_{int(latest_epoch):06d}.pth')
+
+ if self.config['global_rank'] == 0:
+ print(f'Loading model from {gen_path}...')
+ dataG = torch.load(gen_path, map_location=self.config['device'])
+ self.netG.load_state_dict(dataG)
+ if not self.config['model']['no_dis']:
+ dataD = torch.load(dis_path,
+ map_location=self.config['device'])
+ self.netD.load_state_dict(dataD)
+
+ data_opt = torch.load(opt_path, map_location=self.config['device'])
+ self.optimG.load_state_dict(data_opt['optimG'])
+ self.scheG.load_state_dict(data_opt['scheG'])
+ if not self.config['model']['no_dis']:
+ self.optimD.load_state_dict(data_opt['optimD'])
+ self.scheD.load_state_dict(data_opt['scheD'])
+ self.epoch = data_opt['epoch']
+ self.iteration = data_opt['iteration']
+
+ else:
+ if self.config['global_rank'] == 0:
+ print('Warnning: There is no trained model found.'
+ 'An initialized model will be used.')
+
+ def save(self, it):
+ """Save parameters every eval_epoch"""
+ if self.config['global_rank'] == 0:
+ # configure path
+ gen_path = os.path.join(self.config['save_dir'],
+ f'gen_{it:06d}.pth')
+ dis_path = os.path.join(self.config['save_dir'],
+ f'dis_{it:06d}.pth')
+ opt_path = os.path.join(self.config['save_dir'],
+ f'opt_{it:06d}.pth')
+ print(f'\nsaving model to {gen_path} ...')
+
+ # remove .module for saving
+ if isinstance(self.netG, torch.nn.DataParallel) \
+ or isinstance(self.netG, DDP):
+ netG = self.netG.module
+ if not self.config['model']['no_dis']:
+ netD = self.netD.module
+ else:
+ netG = self.netG
+ if not self.config['model']['no_dis']:
+ netD = self.netD
+
+ # save checkpoints
+ torch.save(netG.state_dict(), gen_path)
+ if not self.config['model']['no_dis']:
+ torch.save(netD.state_dict(), dis_path)
+ torch.save(
+ {
+ 'epoch': self.epoch,
+ 'iteration': self.iteration,
+ 'optimG': self.optimG.state_dict(),
+ 'optimD': self.optimD.state_dict(),
+ 'scheG': self.scheG.state_dict(),
+ 'scheD': self.scheD.state_dict()
+ }, opt_path)
+ else:
+ torch.save(
+ {
+ 'epoch': self.epoch,
+ 'iteration': self.iteration,
+ 'optimG': self.optimG.state_dict(),
+ 'scheG': self.scheG.state_dict()
+ }, opt_path)
+
+ latest_path = os.path.join(self.config['save_dir'], 'latest.ckpt')
+ os.system(f"echo {it:06d} > {latest_path}")
+
+ def train(self):
+ """training entry"""
+ pbar = range(int(self.train_args['iterations']))
+ if self.config['global_rank'] == 0:
+ pbar = tqdm(pbar,
+ initial=self.iteration,
+ dynamic_ncols=True,
+ smoothing=0.01)
+
+ os.makedirs('logs', exist_ok=True)
+
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(filename)s[line:%(lineno)d]"
+ "%(levelname)s %(message)s",
+ datefmt="%a, %d %b %Y %H:%M:%S",
+ filename=f"logs/{self.config['save_dir'].split('/')[-1]}.log",
+ filemode='w')
+
+ while True:
+ self.epoch += 1
+ if self.config['distributed']:
+ self.train_sampler.set_epoch(self.epoch)
+
+ self._train_epoch(pbar)
+ if self.iteration > self.train_args['iterations']:
+ break
+ print('\nEnd training....')
+
+ def _train_epoch(self, pbar):
+ """Process input and calculate loss every training epoch"""
+ device = self.config['device']
+
+ for frames, masks, _ in self.train_loader:
+ self.iteration += 1
+
+ frames, masks = frames.to(device), masks.to(device)
+ l_t = self.num_local_frames
+ b, t, c, h, w = frames.size()
+
+ masked_frames = (frames * (1 - masks).float())
+ gt_local_frames = (frames[:, :l_t, ...] + 1) / 2
+
+ pred_imgs, pred_flows = self.netG(masked_frames, l_t)
+ pred_imgs = pred_imgs.view(b, -1, c, h, w)
+ comp_imgs = frames * (1. - masks) + masks * pred_imgs
+
+ # compute flow completion loss
+ flow_loss = self.flow_comp_loss(pred_flows, gt_local_frames)
+
+ gen_loss = 0
+ dis_loss = 0
+
+ if not self.config['model']['no_dis']:
+ # discriminator adversarial loss
+ real_clip = self.netD(frames)
+ fake_clip = self.netD(comp_imgs.detach())
+ dis_real_loss = self.adversarial_loss(real_clip, True, True)
+ dis_fake_loss = self.adversarial_loss(fake_clip, False, True)
+ dis_loss += (dis_real_loss + dis_fake_loss) / 2
+ self.add_summary(self.dis_writer, 'loss/dis_vid_fake',
+ dis_fake_loss.item())
+ self.add_summary(self.dis_writer, 'loss/dis_vid_real',
+ dis_real_loss.item())
+ self.optimD.zero_grad()
+ dis_loss.backward()
+ self.optimD.step()
+
+ # generator adversarial loss
+ gen_clip = self.netD(comp_imgs)
+ gan_loss = self.adversarial_loss(gen_clip, True, False)
+ gan_loss = gan_loss \
+ * self.config['losses']['adversarial_weight']
+ gen_loss += gan_loss
+ self.add_summary(self.gen_writer, 'loss/gan_loss',
+ gan_loss.item())
+
+ flow_loss = flow_loss * self.config['losses']['flow_weight']
+ gen_loss += flow_loss
+ self.add_summary(self.gen_writer, 'loss/flow_loss',
+ flow_loss.item())
+
+ # generator l1 loss
+ hole_loss = self.l1_loss(pred_imgs * masks, frames * masks)
+ hole_loss = hole_loss / torch.mean(masks) \
+ * self.config['losses']['hole_weight']
+ gen_loss += hole_loss
+ self.add_summary(self.gen_writer, 'loss/hole_loss',
+ hole_loss.item())
+
+ valid_loss = self.l1_loss(pred_imgs * (1 - masks),
+ frames * (1 - masks))
+ valid_loss = valid_loss / torch.mean(1-masks) \
+ * self.config['losses']['valid_weight']
+ gen_loss += valid_loss
+ self.add_summary(self.gen_writer, 'loss/valid_loss',
+ valid_loss.item())
+
+ self.optimG.zero_grad()
+ gen_loss.backward()
+ self.optimG.step()
+
+ self.update_learning_rate()
+
+ # console logs
+ if self.config['global_rank'] == 0:
+ pbar.update(1)
+ if not self.config['model']['no_dis']:
+ pbar.set_description((f"flow: {flow_loss.item():.3f}; "
+ f"d: {dis_loss.item():.3f}; "
+ f"hole: {hole_loss.item():.3f}; "
+ f"valid: {valid_loss.item():.3f}"))
+ else:
+ pbar.set_description((f"flow: {flow_loss.item():.3f}; "
+ f"hole: {hole_loss.item():.3f}; "
+ f"valid: {valid_loss.item():.3f}"))
+
+ if self.iteration % self.train_args['log_freq'] == 0:
+ if not self.config['model']['no_dis']:
+ logging.info(f"[Iter {self.iteration}] "
+ f"flow: {flow_loss.item():.4f}; "
+ f"d: {dis_loss.item():.4f}; "
+ f"hole: {hole_loss.item():.4f}; "
+ f"valid: {valid_loss.item():.4f}")
+ else:
+ logging.info(f"[Iter {self.iteration}] "
+ f"flow: {flow_loss.item():.4f}; "
+ f"hole: {hole_loss.item():.4f}; "
+ f"valid: {valid_loss.item():.4f}")
+
+ # saving models
+ if self.iteration % self.train_args['save_freq'] == 0:
+ self.save(int(self.iteration))
+
+ if self.iteration > self.train_args['iterations']:
+ break
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/core/utils.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a173372157b69e11c28961e7760e78cedd81eec
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/core/utils.py
@@ -0,0 +1,330 @@
+import os
+import io
+import cv2
+import random
+import numpy as np
+from PIL import Image, ImageOps
+import zipfile
+
+import torch
+import matplotlib
+import matplotlib.patches as patches
+from matplotlib.path import Path
+from matplotlib import pyplot as plt
+from torchvision import transforms
+
+# matplotlib.use('agg')
+
+# ###########################################################################
+# Directory IO
+# ###########################################################################
+
+
+def read_dirnames_under_root(root_dir):
+ dirnames = [
+ name for i, name in enumerate(sorted(os.listdir(root_dir)))
+ if os.path.isdir(os.path.join(root_dir, name))
+ ]
+ print(f'Reading directories under {root_dir}, num: {len(dirnames)}')
+ return dirnames
+
+
+class TrainZipReader(object):
+ file_dict = dict()
+
+ def __init__(self):
+ super(TrainZipReader, self).__init__()
+
+ @staticmethod
+ def build_file_dict(path):
+ file_dict = TrainZipReader.file_dict
+ if path in file_dict:
+ return file_dict[path]
+ else:
+ file_handle = zipfile.ZipFile(path, 'r')
+ file_dict[path] = file_handle
+ return file_dict[path]
+
+ @staticmethod
+ def imread(path, idx):
+ zfile = TrainZipReader.build_file_dict(path)
+ filelist = zfile.namelist()
+ filelist.sort()
+ data = zfile.read(filelist[idx])
+ #
+ im = Image.open(io.BytesIO(data))
+ return im
+
+
+class TestZipReader(object):
+ file_dict = dict()
+
+ def __init__(self):
+ super(TestZipReader, self).__init__()
+
+ @staticmethod
+ def build_file_dict(path):
+ file_dict = TestZipReader.file_dict
+ if path in file_dict:
+ return file_dict[path]
+ else:
+ file_handle = zipfile.ZipFile(path, 'r')
+ file_dict[path] = file_handle
+ return file_dict[path]
+
+ @staticmethod
+ def imread(path, idx):
+ zfile = TestZipReader.build_file_dict(path)
+ filelist = zfile.namelist()
+ filelist.sort()
+ data = zfile.read(filelist[idx])
+ file_bytes = np.asarray(bytearray(data), dtype=np.uint8)
+ im = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)
+ im = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
+ # im = Image.open(io.BytesIO(data))
+ return im
+
+
+# ###########################################################################
+# Data augmentation
+# ###########################################################################
+
+
+def to_tensors():
+ return transforms.Compose([Stack(), ToTorchFormatTensor()])
+
+
+class GroupRandomHorizontalFlowFlip(object):
+ """Randomly horizontally flips the given PIL.Image with a probability of 0.5
+ """
+ def __init__(self, is_flow=True):
+ self.is_flow = is_flow
+
+ def __call__(self, img_group, mask_group, flowF_group, flowB_group):
+ v = random.random()
+ if v < 0.5:
+ ret_img = [
+ img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group
+ ]
+ ret_mask = [
+ mask.transpose(Image.FLIP_LEFT_RIGHT) for mask in mask_group
+ ]
+ ret_flowF = [ff[:, ::-1] * [-1.0, 1.0] for ff in flowF_group]
+ ret_flowB = [fb[:, ::-1] * [-1.0, 1.0] for fb in flowB_group]
+ return ret_img, ret_mask, ret_flowF, ret_flowB
+ else:
+ return img_group, mask_group, flowF_group, flowB_group
+
+
+class GroupRandomHorizontalFlip(object):
+ """Randomly horizontally flips the given PIL.Image with a probability of 0.5
+ """
+ def __init__(self, is_flow=False):
+ self.is_flow = is_flow
+
+ def __call__(self, img_group, is_flow=False):
+ v = random.random()
+ if v < 0.5:
+ ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
+ if self.is_flow:
+ for i in range(0, len(ret), 2):
+ # invert flow pixel values when flipping
+ ret[i] = ImageOps.invert(ret[i])
+ return ret
+ else:
+ return img_group
+
+
+class Stack(object):
+ def __init__(self, roll=False):
+ self.roll = roll
+
+ def __call__(self, img_group):
+ mode = img_group[0].mode
+ if mode == '1':
+ img_group = [img.convert('L') for img in img_group]
+ mode = 'L'
+ if mode == 'L':
+ return np.stack([np.expand_dims(x, 2) for x in img_group], axis=2)
+ elif mode == 'RGB':
+ if self.roll:
+ return np.stack([np.array(x)[:, :, ::-1] for x in img_group],
+ axis=2)
+ else:
+ return np.stack(img_group, axis=2)
+ else:
+ raise NotImplementedError(f"Image mode {mode}")
+
+
+class ToTorchFormatTensor(object):
+ """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
+ to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """
+ def __init__(self, div=True):
+ self.div = div
+
+ def __call__(self, pic):
+ if isinstance(pic, np.ndarray):
+            # numpy img stacked as [H, W, T, C]; convert to [T, C, H, W]
+ img = torch.from_numpy(pic).permute(2, 3, 0, 1).contiguous()
+ else:
+ # handle PIL Image
+ img = torch.ByteTensor(torch.ByteStorage.from_buffer(
+ pic.tobytes()))
+ img = img.view(pic.size[1], pic.size[0], len(pic.mode))
+ # put it from HWC to CHW format
+ # yikes, this transpose takes 80% of the loading time/CPU
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
+ img = img.float().div(255) if self.div else img.float()
+ return img
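+
+
+# Illustrative usage sketch (added for documentation; not part of upstream
+# E2FGVI). It shows how `to_tensors()` chains Stack and ToTorchFormatTensor to
+# turn a list of same-sized RGB PIL frames into a (T, C, H, W) float tensor in
+# [0, 1]. The dummy black frames below are placeholders for the example only.
+def _demo_to_tensors():
+    frames = [
+        Image.fromarray(np.zeros((240, 432, 3), dtype=np.uint8))
+        for _ in range(5)
+    ]
+    batch = to_tensors()(frames)
+    print(batch.shape)  # torch.Size([5, 3, 240, 432])
+    return batch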
+
+
+# ###########################################################################
+# Create masks with random shape
+# ###########################################################################
+
+
+def create_random_shape_with_random_motion(video_length,
+ imageHeight=240,
+ imageWidth=432):
+ # get a random shape
+ height = random.randint(imageHeight // 3, imageHeight - 1)
+ width = random.randint(imageWidth // 3, imageWidth - 1)
+ edge_num = random.randint(6, 8)
+ ratio = random.randint(6, 8) / 10
+ region = get_random_shape(edge_num=edge_num,
+ ratio=ratio,
+ height=height,
+ width=width)
+ region_width, region_height = region.size
+ # get random position
+ x, y = random.randint(0, imageHeight - region_height), random.randint(
+ 0, imageWidth - region_width)
+ velocity = get_random_velocity(max_speed=3)
+ m = Image.fromarray(np.zeros((imageHeight, imageWidth)).astype(np.uint8))
+ m.paste(region, (y, x, y + region.size[0], x + region.size[1]))
+ masks = [m.convert('L')]
+ # return fixed masks
+ if random.uniform(0, 1) > 0.5:
+ return masks * video_length
+ # return moving masks
+ for _ in range(video_length - 1):
+ x, y, velocity = random_move_control_points(x,
+ y,
+ imageHeight,
+ imageWidth,
+ velocity,
+ region.size,
+ maxLineAcceleration=(3,
+ 0.5),
+ maxInitSpeed=3)
+ m = Image.fromarray(
+ np.zeros((imageHeight, imageWidth)).astype(np.uint8))
+ m.paste(region, (y, x, y + region.size[0], x + region.size[1]))
+ masks.append(m.convert('L'))
+ return masks
+
+
+def get_random_shape(edge_num=9, ratio=0.7, width=432, height=240):
+ '''
+    There is the initial point and 3 points per cubic Bezier curve.
+    Thus, the curve will only pass through n points, which will be the sharp edges.
+    The other 2 points modify the shape of the Bezier curve.
+    edge_num: number of possibly sharp edges
+    points_num: number of points in the Path
+    ratio: (0, 1) magnitude of the perturbation from the unit circle
+ '''
+ points_num = edge_num * 3 + 1
+ angles = np.linspace(0, 2 * np.pi, points_num)
+ codes = np.full(points_num, Path.CURVE4)
+ codes[0] = Path.MOVETO
+    # Using this instead of Path.CLOSEPOLY avoids an unnecessary straight line
+ verts = np.stack((np.cos(angles), np.sin(angles))).T * \
+ (2*ratio*np.random.random(points_num)+1-ratio)[:, None]
+ verts[-1, :] = verts[0, :]
+ path = Path(verts, codes)
+ # draw paths into images
+ fig = plt.figure()
+ ax = fig.add_subplot(111)
+ patch = patches.PathPatch(path, facecolor='black', lw=2)
+ ax.add_patch(patch)
+ ax.set_xlim(np.min(verts) * 1.1, np.max(verts) * 1.1)
+ ax.set_ylim(np.min(verts) * 1.1, np.max(verts) * 1.1)
+ ax.axis('off') # removes the axis to leave only the shape
+ fig.canvas.draw()
+ # convert plt images into numpy images
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+ data = data.reshape((fig.canvas.get_width_height()[::-1] + (3, )))
+ plt.close(fig)
+ # postprocess
+ data = cv2.resize(data, (width, height))[:, :, 0]
+ data = (1 - np.array(data > 0).astype(np.uint8)) * 255
+    coordinates = np.where(data > 0)
+    xmin, xmax, ymin, ymax = np.min(coordinates[0]), np.max(
+        coordinates[0]), np.min(coordinates[1]), np.max(coordinates[1])
+ region = Image.fromarray(data).crop((ymin, xmin, ymax, xmax))
+ return region
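+
+
+# Illustrative sketch (added for documentation; not part of upstream E2FGVI):
+# draw one random Bezier blob and report the size of the cropped mask region.
+# The edge_num/ratio values are arbitrary choices for the example only.
+def _demo_get_random_shape():
+    region = get_random_shape(edge_num=7, ratio=0.7, width=432, height=240)
+    # `region` is a single-channel PIL image cropped to the blob's bounding box
+    print(region.mode, region.size)
+    return region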
+
+
+def random_accelerate(velocity, maxAcceleration, dist='uniform'):
+ speed, angle = velocity
+ d_speed, d_angle = maxAcceleration
+ if dist == 'uniform':
+ speed += np.random.uniform(-d_speed, d_speed)
+ angle += np.random.uniform(-d_angle, d_angle)
+    elif dist == 'gaussian':
+ speed += np.random.normal(0, d_speed / 2)
+ angle += np.random.normal(0, d_angle / 2)
+ else:
+ raise NotImplementedError(
+ f'Distribution type {dist} is not supported.')
+ return (speed, angle)
+
+
+def get_random_velocity(max_speed=3, dist='uniform'):
+    if dist == 'uniform':
+        # sample the speed uniformly from [0, max_speed)
+        speed = np.random.uniform(0, max_speed)
+    elif dist == 'gaussian':
+        speed = np.abs(np.random.normal(0, max_speed / 2))
+ else:
+ raise NotImplementedError(
+ f'Distribution type {dist} is not supported.')
+ angle = np.random.uniform(0, 2 * np.pi)
+ return (speed, angle)
+
+
+def random_move_control_points(X,
+ Y,
+ imageHeight,
+ imageWidth,
+ lineVelocity,
+ region_size,
+ maxLineAcceleration=(3, 0.5),
+ maxInitSpeed=3):
+ region_width, region_height = region_size
+ speed, angle = lineVelocity
+ X += int(speed * np.cos(angle))
+ Y += int(speed * np.sin(angle))
+    lineVelocity = random_accelerate(lineVelocity,
+                                     maxLineAcceleration,
+                                     dist='gaussian')
+ if ((X > imageHeight - region_height) or (X < 0)
+ or (Y > imageWidth - region_width) or (Y < 0)):
+        lineVelocity = get_random_velocity(maxInitSpeed, dist='gaussian')
+ new_X = np.clip(X, 0, imageHeight - region_height)
+ new_Y = np.clip(Y, 0, imageWidth - region_width)
+ return new_X, new_Y, lineVelocity
+
+
+if __name__ == '__main__':
+
+ trials = 10
+ for _ in range(trials):
+ video_length = 10
+ # The returned masks are either stationary (50%) or moving (50%)
+ masks = create_random_shape_with_random_motion(video_length,
+ imageHeight=240,
+ imageWidth=432)
+
+ for m in masks:
+ cv2.imshow('mask', np.array(m))
+ cv2.waitKey(500)
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/evaluate.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8f70789ce9f510767bf6cae12d4f374749ad8ec
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/evaluate.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+import cv2
+import numpy as np
+import importlib
+import os
+import argparse
+from PIL import Image
+
+import torch
+from torch.utils.data import DataLoader
+
+from core.dataset import TestDataset
+from core.metrics import calc_psnr_and_ssim, calculate_i3d_activations, calculate_vfid, init_i3d_model
+
+# global variables
+w, h = 432, 240
+ref_length = 10
+neighbor_stride = 5
+default_fps = 24
+
+
+# sample reference frames from the whole video
+def get_ref_index(neighbor_ids, length):
+ ref_index = []
+ for i in range(0, length, ref_length):
+ if i not in neighbor_ids:
+ ref_index.append(i)
+ return ref_index
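+
+
+# Worked example (added for documentation; not part of upstream E2FGVI): with
+# ref_length = 10, every 10th frame that is not already a local neighbor is
+# used as a non-local reference frame.
+def _demo_get_ref_index():
+    neighbor_ids = list(range(10, 21))      # local window around frame 15
+    print(get_ref_index(neighbor_ids, 50))  # [0, 30, 40]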
+
+
+def main_worker(args):
+ args.size = (w, h)
+ # set up datasets and data loader
+ assert (args.dataset == 'davis') or args.dataset == 'youtube-vos', \
+ f"{args.dataset} dataset is not supported"
+ test_dataset = TestDataset(args)
+
+ test_loader = DataLoader(test_dataset,
+ batch_size=1,
+ shuffle=False,
+ num_workers=args.num_workers)
+
+ # set up models
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ net = importlib.import_module('model.' + args.model)
+ model = net.InpaintGenerator().to(device)
+ data = torch.load(args.ckpt, map_location=device)
+ model.load_state_dict(data)
+ print(f'Loading from: {args.ckpt}')
+ model.eval()
+
+ total_frame_psnr = []
+ total_frame_ssim = []
+
+ output_i3d_activations = []
+ real_i3d_activations = []
+
+ print('Start evaluation...')
+
+ # create results directory
+ result_path = os.path.join('results', f'{args.model}_{args.dataset}')
+ if not os.path.exists(result_path):
+ os.makedirs(result_path)
+ eval_summary = open(
+ os.path.join(result_path, f"{args.model}_{args.dataset}_metrics.txt"),
+ "w")
+
+ i3d_model = init_i3d_model()
+
+ for index, items in enumerate(test_loader):
+ frames, masks, video_name, frames_PIL = items
+
+ video_length = frames.size(1)
+ frames, masks = frames.to(device), masks.to(device)
+ ori_frames = frames_PIL
+ ori_frames = [
+ ori_frames[i].squeeze().cpu().numpy() for i in range(video_length)
+ ]
+ comp_frames = [None] * video_length
+
+ # complete holes by our model
+ for f in range(0, video_length, neighbor_stride):
+ neighbor_ids = [
+ i for i in range(max(0, f - neighbor_stride),
+ min(video_length, f + neighbor_stride + 1))
+ ]
+ ref_ids = get_ref_index(neighbor_ids, video_length)
+ selected_imgs = frames[:1, neighbor_ids + ref_ids, :, :, :]
+ selected_masks = masks[:1, neighbor_ids + ref_ids, :, :, :]
+ with torch.no_grad():
+ masked_frames = selected_imgs * (1 - selected_masks)
+ pred_img, _ = model(masked_frames, len(neighbor_ids))
+
+ pred_img = (pred_img + 1) / 2
+ pred_img = pred_img.cpu().permute(0, 2, 3, 1).numpy() * 255
+ binary_masks = masks[0, neighbor_ids, :, :, :].cpu().permute(
+ 0, 2, 3, 1).numpy().astype(np.uint8)
+ for i in range(len(neighbor_ids)):
+ idx = neighbor_ids[i]
+ img = np.array(pred_img[i]).astype(np.uint8) * binary_masks[i] \
+ + ori_frames[idx] * (1 - binary_masks[i])
+ if comp_frames[idx] is None:
+ comp_frames[idx] = img
+ else:
+ comp_frames[idx] = comp_frames[idx].astype(
+ np.float32) * 0.5 + img.astype(np.float32) * 0.5
+
+ # calculate metrics
+ cur_video_psnr = []
+ cur_video_ssim = []
+ comp_PIL = [] # to calculate VFID
+ frames_PIL = []
+ for ori, comp in zip(ori_frames, comp_frames):
+ psnr, ssim = calc_psnr_and_ssim(ori, comp)
+
+ cur_video_psnr.append(psnr)
+ cur_video_ssim.append(ssim)
+
+ total_frame_psnr.append(psnr)
+ total_frame_ssim.append(ssim)
+
+ frames_PIL.append(Image.fromarray(ori.astype(np.uint8)))
+ comp_PIL.append(Image.fromarray(comp.astype(np.uint8)))
+ cur_psnr = sum(cur_video_psnr) / len(cur_video_psnr)
+ cur_ssim = sum(cur_video_ssim) / len(cur_video_ssim)
+
+ # saving i3d activations
+ frames_i3d, comp_i3d = calculate_i3d_activations(frames_PIL,
+ comp_PIL,
+ i3d_model,
+ device=device)
+ real_i3d_activations.append(frames_i3d)
+ output_i3d_activations.append(comp_i3d)
+
+ print(
+ f'[{index+1:3}/{len(test_loader)}] Name: {str(video_name):25} | PSNR/SSIM: {cur_psnr:.4f}/{cur_ssim:.4f}'
+ )
+ eval_summary.write(
+ f'[{index+1:3}/{len(test_loader)}] Name: {str(video_name):25} | PSNR/SSIM: {cur_psnr:.4f}/{cur_ssim:.4f}\n'
+ )
+
+        # saving images for evaluating warping errors
+ if args.save_results:
+ save_frame_path = os.path.join(result_path, video_name[0])
+ os.makedirs(save_frame_path, exist_ok=False)
+
+ for i, frame in enumerate(comp_frames):
+ cv2.imwrite(
+ os.path.join(save_frame_path,
+ str(i).zfill(5) + '.png'),
+ cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_RGB2BGR))
+
+ avg_frame_psnr = sum(total_frame_psnr) / len(total_frame_psnr)
+ avg_frame_ssim = sum(total_frame_ssim) / len(total_frame_ssim)
+
+ fid_score = calculate_vfid(real_i3d_activations, output_i3d_activations)
+ print('Finish evaluation... Average Frame PSNR/SSIM/VFID: '
+ f'{avg_frame_psnr:.2f}/{avg_frame_ssim:.4f}/{fid_score:.3f}')
+ eval_summary.write(
+ 'Finish evaluation... Average Frame PSNR/SSIM/VFID: '
+ f'{avg_frame_psnr:.2f}/{avg_frame_ssim:.4f}/{fid_score:.3f}')
+ eval_summary.close()
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='E2FGVI')
+ parser.add_argument('--dataset',
+ choices=['davis', 'youtube-vos'],
+ type=str)
+ parser.add_argument('--data_root', type=str, required=True)
+ parser.add_argument('--model', choices=['e2fgvi', 'e2fgvi_hq'], type=str)
+ parser.add_argument('--ckpt', type=str, required=True)
+ parser.add_argument('--save_results', action='store_true', default=False)
+ parser.add_argument('--num_workers', default=4, type=int)
+ args = parser.parse_args()
+ main_worker(args)
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/__init__.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/e2fgvi.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/e2fgvi.py
new file mode 100644
index 0000000000000000000000000000000000000000..cac63a3786e71b1e692e28996128b5869b9be3fd
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/e2fgvi.py
@@ -0,0 +1,350 @@
+''' Towards An End-to-End Framework for Video Inpainting
+'''
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from E2FGVI.model.modules.flow_comp import SPyNet
+from E2FGVI.model.modules.feat_prop import BidirectionalPropagation, SecondOrderDeformableAlignment
+from E2FGVI.model.modules.tfocal_transformer import TemporalFocalTransformerBlock, SoftSplit, SoftComp
+from E2FGVI.model.modules.spectral_norm import spectral_norm as _spectral_norm
+
+
+class BaseNetwork(nn.Module):
+ def __init__(self):
+ super(BaseNetwork, self).__init__()
+
+ def print_network(self):
+ if isinstance(self, list):
+ self = self[0]
+ num_params = 0
+ for param in self.parameters():
+ num_params += param.numel()
+ print(
+ 'Network [%s] was created. Total number of parameters: %.1f million. '
+ 'To see the architecture, do print(network).' %
+ (type(self).__name__, num_params / 1000000))
+
+ def init_weights(self, init_type='normal', gain=0.02):
+ '''
+ initialize network's weights
+ init_type: normal | xavier | kaiming | orthogonal
+ https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39
+ '''
+ def init_func(m):
+ classname = m.__class__.__name__
+ if classname.find('InstanceNorm2d') != -1:
+ if hasattr(m, 'weight') and m.weight is not None:
+ nn.init.constant_(m.weight.data, 1.0)
+ if hasattr(m, 'bias') and m.bias is not None:
+ nn.init.constant_(m.bias.data, 0.0)
+ elif hasattr(m, 'weight') and (classname.find('Conv') != -1
+ or classname.find('Linear') != -1):
+ if init_type == 'normal':
+ nn.init.normal_(m.weight.data, 0.0, gain)
+ elif init_type == 'xavier':
+ nn.init.xavier_normal_(m.weight.data, gain=gain)
+ elif init_type == 'xavier_uniform':
+ nn.init.xavier_uniform_(m.weight.data, gain=1.0)
+ elif init_type == 'kaiming':
+ nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
+ elif init_type == 'orthogonal':
+ nn.init.orthogonal_(m.weight.data, gain=gain)
+ elif init_type == 'none': # uses pytorch's default init method
+ m.reset_parameters()
+ else:
+ raise NotImplementedError(
+ 'initialization method [%s] is not implemented' %
+ init_type)
+ if hasattr(m, 'bias') and m.bias is not None:
+ nn.init.constant_(m.bias.data, 0.0)
+
+ self.apply(init_func)
+
+ # propagate to children
+ for m in self.children():
+ if hasattr(m, 'init_weights'):
+ m.init_weights(init_type, gain)
+
+
+class Encoder(nn.Module):
+ def __init__(self):
+ super(Encoder, self).__init__()
+ self.group = [1, 2, 4, 8, 1]
+ self.layers = nn.ModuleList([
+ nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1, groups=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(640, 512, kernel_size=3, stride=1, padding=1, groups=2),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(768, 384, kernel_size=3, stride=1, padding=1, groups=4),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(640, 256, kernel_size=3, stride=1, padding=1, groups=8),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(512, 128, kernel_size=3, stride=1, padding=1, groups=1),
+ nn.LeakyReLU(0.2, inplace=True)
+ ])
+
+ def forward(self, x):
+ bt, c, h, w = x.size()
+ h, w = h // 4, w // 4
+ out = x
+ for i, layer in enumerate(self.layers):
+ if i == 8:
+ x0 = out
+ if i > 8 and i % 2 == 0:
+ g = self.group[(i - 8) // 2]
+ x = x0.view(bt, g, -1, h, w)
+ o = out.view(bt, g, -1, h, w)
+ out = torch.cat([x, o], 2).view(bt, -1, h, w)
+ out = layer(out)
+ return out
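+
+
+# Illustrative sketch (added for documentation; not part of upstream E2FGVI):
+# the encoder downsamples by 4x and fuses grouped skip connections with the
+# intermediate 256-channel feature map captured at i == 8, ending with 128
+# output channels. The dummy input below is for the example only.
+def _demo_encoder():
+    frames = torch.randn(1, 3, 240, 432)  # (b * t, c, h, w)
+    feat = Encoder()(frames)
+    print(feat.shape)  # torch.Size([1, 128, 60, 108])
+    return feat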
+
+
+class deconv(nn.Module):
+ def __init__(self,
+ input_channel,
+ output_channel,
+ kernel_size=3,
+ padding=0):
+ super().__init__()
+ self.conv = nn.Conv2d(input_channel,
+ output_channel,
+ kernel_size=kernel_size,
+ stride=1,
+ padding=padding)
+
+ def forward(self, x):
+ x = F.interpolate(x,
+ scale_factor=2,
+ mode='bilinear',
+ align_corners=True)
+ return self.conv(x)
+
+
+class InpaintGenerator(BaseNetwork):
+ def __init__(self, init_weights=True):
+ super(InpaintGenerator, self).__init__()
+ channel = 256
+ hidden = 512
+
+ # encoder
+ self.encoder = Encoder()
+
+ # decoder
+ self.decoder = nn.Sequential(
+ deconv(channel // 2, 128, kernel_size=3, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ deconv(64, 64, kernel_size=3, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1))
+
+ # feature propagation module
+ self.feat_prop_module = BidirectionalPropagation(channel // 2)
+
+ # soft split and soft composition
+ kernel_size = (7, 7)
+ padding = (3, 3)
+ stride = (3, 3)
+ output_size = (60, 108)
+ t2t_params = {
+ 'kernel_size': kernel_size,
+ 'stride': stride,
+ 'padding': padding,
+ 'output_size': output_size
+ }
+ self.ss = SoftSplit(channel // 2,
+ hidden,
+ kernel_size,
+ stride,
+ padding,
+ t2t_param=t2t_params)
+ self.sc = SoftComp(channel // 2, hidden, output_size, kernel_size,
+ stride, padding)
+
+ n_vecs = 1
+ for i, d in enumerate(kernel_size):
+ n_vecs *= int((output_size[i] + 2 * padding[i] -
+ (d - 1) - 1) / stride[i] + 1)
+
+ blocks = []
+ depths = 8
+ num_heads = [4] * depths
+ window_size = [(5, 9)] * depths
+ focal_windows = [(5, 9)] * depths
+ focal_levels = [2] * depths
+ pool_method = "fc"
+
+ for i in range(depths):
+ blocks.append(
+ TemporalFocalTransformerBlock(dim=hidden,
+ num_heads=num_heads[i],
+ window_size=window_size[i],
+ focal_level=focal_levels[i],
+ focal_window=focal_windows[i],
+ n_vecs=n_vecs,
+ t2t_params=t2t_params,
+ pool_method=pool_method))
+ self.transformer = nn.Sequential(*blocks)
+
+ if init_weights:
+ self.init_weights()
+            # Need to initialize the weights of MSDeformAttn specifically
+ for m in self.modules():
+ if isinstance(m, SecondOrderDeformableAlignment):
+ m.init_offset()
+
+ # flow completion network
+ self.update_spynet = SPyNet()
+
+ def forward_bidirect_flow(self, masked_local_frames):
+ b, l_t, c, h, w = masked_local_frames.size()
+
+ # compute forward and backward flows of masked frames
+ masked_local_frames = F.interpolate(masked_local_frames.view(
+ -1, c, h, w),
+ scale_factor=1 / 4,
+ mode='bilinear',
+ align_corners=True,
+ recompute_scale_factor=True)
+ masked_local_frames = masked_local_frames.view(b, l_t, c, h // 4,
+ w // 4)
+ mlf_1 = masked_local_frames[:, :-1, :, :, :].reshape(
+ -1, c, h // 4, w // 4)
+ mlf_2 = masked_local_frames[:, 1:, :, :, :].reshape(
+ -1, c, h // 4, w // 4)
+ pred_flows_forward = self.update_spynet(mlf_1, mlf_2)
+ pred_flows_backward = self.update_spynet(mlf_2, mlf_1)
+
+ pred_flows_forward = pred_flows_forward.view(b, l_t - 1, 2, h // 4,
+ w // 4)
+ pred_flows_backward = pred_flows_backward.view(b, l_t - 1, 2, h // 4,
+ w // 4)
+
+ return pred_flows_forward, pred_flows_backward
+
+ def forward(self, masked_frames, num_local_frames):
+ l_t = num_local_frames
+ b, t, ori_c, ori_h, ori_w = masked_frames.size()
+
+ # normalization before feeding into the flow completion module
+ masked_local_frames = (masked_frames[:, :l_t, ...] + 1) / 2
+ pred_flows = self.forward_bidirect_flow(masked_local_frames)
+
+ # extracting features and performing the feature propagation on local features
+ enc_feat = self.encoder(masked_frames.view(b * t, ori_c, ori_h, ori_w))
+ _, c, h, w = enc_feat.size()
+ local_feat = enc_feat.view(b, t, c, h, w)[:, :l_t, ...]
+ ref_feat = enc_feat.view(b, t, c, h, w)[:, l_t:, ...]
+ local_feat = self.feat_prop_module(local_feat, pred_flows[0],
+ pred_flows[1])
+ enc_feat = torch.cat((local_feat, ref_feat), dim=1)
+
+ # content hallucination through stacking multiple temporal focal transformer blocks
+ trans_feat = self.ss(enc_feat.view(-1, c, h, w), b)
+ trans_feat = self.transformer(trans_feat)
+ trans_feat = self.sc(trans_feat, t)
+ trans_feat = trans_feat.view(b, t, -1, h, w)
+ enc_feat = enc_feat + trans_feat
+
+ # decode frames from features
+ output = self.decoder(enc_feat.view(b * t, c, h, w))
+ output = torch.tanh(output)
+ return output, pred_flows
+
+
+# ######################################################################
+# Discriminator for Temporal Patch GAN
+# ######################################################################
+
+
+class Discriminator(BaseNetwork):
+ def __init__(self,
+ in_channels=3,
+ use_sigmoid=False,
+ use_spectral_norm=True,
+ init_weights=True):
+ super(Discriminator, self).__init__()
+ self.use_sigmoid = use_sigmoid
+ nf = 32
+
+ self.conv = nn.Sequential(
+ spectral_norm(
+ nn.Conv3d(in_channels=in_channels,
+ out_channels=nf * 1,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=1,
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(64, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ spectral_norm(
+ nn.Conv3d(nf * 1,
+ nf * 2,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2),
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(128, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ spectral_norm(
+ nn.Conv3d(nf * 2,
+ nf * 4,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2),
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(256, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ spectral_norm(
+ nn.Conv3d(nf * 4,
+ nf * 4,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2),
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(256, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ spectral_norm(
+ nn.Conv3d(nf * 4,
+ nf * 4,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2),
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(256, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv3d(nf * 4,
+ nf * 4,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2)))
+
+ if init_weights:
+ self.init_weights()
+
+ def forward(self, xs):
+ # T, C, H, W = xs.shape (old)
+ # B, T, C, H, W (new)
+ xs_t = torch.transpose(xs, 1, 2)
+ feat = self.conv(xs_t)
+ if self.use_sigmoid:
+ feat = torch.sigmoid(feat)
+ out = torch.transpose(feat, 1, 2) # B, T, C, H, W
+ return out
+
+
+def spectral_norm(module, mode=True):
+ if mode:
+ return _spectral_norm(module)
+ return module
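+
+
+# Illustrative sketch (added for documentation; not part of upstream E2FGVI):
+# the T-PatchGAN discriminator consumes a (B, T, C, H, W) clip and returns a
+# spatio-temporal patch map in which the temporal length is preserved and the
+# spatial dimensions are strongly downsampled. The random clip is a placeholder.
+def _demo_discriminator():
+    clip = torch.randn(1, 5, 3, 240, 432)
+    logits = Discriminator()(clip)
+    print(logits.shape)  # (1, 5, 128, h', w') with small h', w'
+    return logits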
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/e2fgvi_hq.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/e2fgvi_hq.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6bc78760ebc22ce52a80ee218e07985098abf7d
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/e2fgvi_hq.py
@@ -0,0 +1,350 @@
+''' Towards An End-to-End Framework for Video Inpainting
+'''
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from E2FGVI.model.modules.flow_comp import SPyNet
+from E2FGVI.model.modules.feat_prop import BidirectionalPropagation, SecondOrderDeformableAlignment
+from E2FGVI.model.modules.tfocal_transformer_hq import TemporalFocalTransformerBlock, SoftSplit, SoftComp
+from E2FGVI.model.modules.spectral_norm import spectral_norm as _spectral_norm
+
+
+class BaseNetwork(nn.Module):
+ def __init__(self):
+ super(BaseNetwork, self).__init__()
+
+ def print_network(self):
+ if isinstance(self, list):
+ self = self[0]
+ num_params = 0
+ for param in self.parameters():
+ num_params += param.numel()
+ print(
+ 'Network [%s] was created. Total number of parameters: %.1f million. '
+ 'To see the architecture, do print(network).' %
+ (type(self).__name__, num_params / 1000000))
+
+ def init_weights(self, init_type='normal', gain=0.02):
+ '''
+ initialize network's weights
+ init_type: normal | xavier | kaiming | orthogonal
+ https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39
+ '''
+ def init_func(m):
+ classname = m.__class__.__name__
+ if classname.find('InstanceNorm2d') != -1:
+ if hasattr(m, 'weight') and m.weight is not None:
+ nn.init.constant_(m.weight.data, 1.0)
+ if hasattr(m, 'bias') and m.bias is not None:
+ nn.init.constant_(m.bias.data, 0.0)
+ elif hasattr(m, 'weight') and (classname.find('Conv') != -1
+ or classname.find('Linear') != -1):
+ if init_type == 'normal':
+ nn.init.normal_(m.weight.data, 0.0, gain)
+ elif init_type == 'xavier':
+ nn.init.xavier_normal_(m.weight.data, gain=gain)
+ elif init_type == 'xavier_uniform':
+ nn.init.xavier_uniform_(m.weight.data, gain=1.0)
+ elif init_type == 'kaiming':
+ nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
+ elif init_type == 'orthogonal':
+ nn.init.orthogonal_(m.weight.data, gain=gain)
+ elif init_type == 'none': # uses pytorch's default init method
+ m.reset_parameters()
+ else:
+ raise NotImplementedError(
+ 'initialization method [%s] is not implemented' %
+ init_type)
+ if hasattr(m, 'bias') and m.bias is not None:
+ nn.init.constant_(m.bias.data, 0.0)
+
+ self.apply(init_func)
+
+ # propagate to children
+ for m in self.children():
+ if hasattr(m, 'init_weights'):
+ m.init_weights(init_type, gain)
+
+
+class Encoder(nn.Module):
+ def __init__(self):
+ super(Encoder, self).__init__()
+ self.group = [1, 2, 4, 8, 1]
+ self.layers = nn.ModuleList([
+ nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1, groups=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(640, 512, kernel_size=3, stride=1, padding=1, groups=2),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(768, 384, kernel_size=3, stride=1, padding=1, groups=4),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(640, 256, kernel_size=3, stride=1, padding=1, groups=8),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(512, 128, kernel_size=3, stride=1, padding=1, groups=1),
+ nn.LeakyReLU(0.2, inplace=True)
+ ])
+
+ def forward(self, x):
+ bt, c, _, _ = x.size()
+ # h, w = h//4, w//4
+ out = x
+ for i, layer in enumerate(self.layers):
+ if i == 8:
+ x0 = out
+ _, _, h, w = x0.size()
+ if i > 8 and i % 2 == 0:
+ g = self.group[(i - 8) // 2]
+ x = x0.view(bt, g, -1, h, w)
+ o = out.view(bt, g, -1, h, w)
+ out = torch.cat([x, o], 2).view(bt, -1, h, w)
+ out = layer(out)
+ return out
+
+
+class deconv(nn.Module):
+ def __init__(self,
+ input_channel,
+ output_channel,
+ kernel_size=3,
+ padding=0):
+ super().__init__()
+ self.conv = nn.Conv2d(input_channel,
+ output_channel,
+ kernel_size=kernel_size,
+ stride=1,
+ padding=padding)
+
+ def forward(self, x):
+ x = F.interpolate(x,
+ scale_factor=2,
+ mode='bilinear',
+ align_corners=True)
+ return self.conv(x)
+
+
+class InpaintGenerator(BaseNetwork):
+ def __init__(self, init_weights=True):
+ super(InpaintGenerator, self).__init__()
+ channel = 256
+ hidden = 512
+
+ # encoder
+ self.encoder = Encoder()
+
+ # decoder
+ self.decoder = nn.Sequential(
+ deconv(channel // 2, 128, kernel_size=3, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ deconv(64, 64, kernel_size=3, padding=1),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1))
+
+ # feature propagation module
+ self.feat_prop_module = BidirectionalPropagation(channel // 2)
+
+ # soft split and soft composition
+ kernel_size = (7, 7)
+ padding = (3, 3)
+ stride = (3, 3)
+ output_size = (60, 108)
+ t2t_params = {
+ 'kernel_size': kernel_size,
+ 'stride': stride,
+ 'padding': padding
+ }
+ self.ss = SoftSplit(channel // 2,
+ hidden,
+ kernel_size,
+ stride,
+ padding,
+ t2t_param=t2t_params)
+ self.sc = SoftComp(channel // 2, hidden, kernel_size, stride, padding)
+
+ n_vecs = 1
+ for i, d in enumerate(kernel_size):
+ n_vecs *= int((output_size[i] + 2 * padding[i] -
+ (d - 1) - 1) / stride[i] + 1)
+
+ blocks = []
+ depths = 8
+ num_heads = [4] * depths
+ window_size = [(5, 9)] * depths
+ focal_windows = [(5, 9)] * depths
+ focal_levels = [2] * depths
+ pool_method = "fc"
+
+ for i in range(depths):
+ blocks.append(
+ TemporalFocalTransformerBlock(dim=hidden,
+ num_heads=num_heads[i],
+ window_size=window_size[i],
+ focal_level=focal_levels[i],
+ focal_window=focal_windows[i],
+ n_vecs=n_vecs,
+ t2t_params=t2t_params,
+ pool_method=pool_method))
+ self.transformer = nn.Sequential(*blocks)
+
+ if init_weights:
+ self.init_weights()
+            # Need to initialize the weights of MSDeformAttn specifically
+ for m in self.modules():
+ if isinstance(m, SecondOrderDeformableAlignment):
+ m.init_offset()
+
+ # flow completion network
+ self.update_spynet = SPyNet()
+
+ def forward_bidirect_flow(self, masked_local_frames):
+ b, l_t, c, h, w = masked_local_frames.size()
+
+ # compute forward and backward flows of masked frames
+ masked_local_frames = F.interpolate(masked_local_frames.view(
+ -1, c, h, w),
+ scale_factor=1 / 4,
+ mode='bilinear',
+ align_corners=True,
+ recompute_scale_factor=True)
+ masked_local_frames = masked_local_frames.view(b, l_t, c, h // 4,
+ w // 4)
+ mlf_1 = masked_local_frames[:, :-1, :, :, :].reshape(
+ -1, c, h // 4, w // 4)
+ mlf_2 = masked_local_frames[:, 1:, :, :, :].reshape(
+ -1, c, h // 4, w // 4)
+ pred_flows_forward = self.update_spynet(mlf_1, mlf_2)
+ pred_flows_backward = self.update_spynet(mlf_2, mlf_1)
+
+ pred_flows_forward = pred_flows_forward.view(b, l_t - 1, 2, h // 4,
+ w // 4)
+ pred_flows_backward = pred_flows_backward.view(b, l_t - 1, 2, h // 4,
+ w // 4)
+
+ return pred_flows_forward, pred_flows_backward
+
+ def forward(self, masked_frames, num_local_frames):
+ l_t = num_local_frames
+ b, t, ori_c, ori_h, ori_w = masked_frames.size()
+
+ # normalization before feeding into the flow completion module
+ masked_local_frames = (masked_frames[:, :l_t, ...] + 1) / 2
+ pred_flows = self.forward_bidirect_flow(masked_local_frames)
+
+ # extracting features and performing the feature propagation on local features
+ enc_feat = self.encoder(masked_frames.view(b * t, ori_c, ori_h, ori_w))
+ _, c, h, w = enc_feat.size()
+ fold_output_size = (h, w)
+ local_feat = enc_feat.view(b, t, c, h, w)[:, :l_t, ...]
+ ref_feat = enc_feat.view(b, t, c, h, w)[:, l_t:, ...]
+ local_feat = self.feat_prop_module(local_feat, pred_flows[0],
+ pred_flows[1])
+ enc_feat = torch.cat((local_feat, ref_feat), dim=1)
+
+ # content hallucination through stacking multiple temporal focal transformer blocks
+ trans_feat = self.ss(enc_feat.view(-1, c, h, w), b, fold_output_size)
+ trans_feat = self.transformer([trans_feat, fold_output_size])
+ trans_feat = self.sc(trans_feat[0], t, fold_output_size)
+ trans_feat = trans_feat.view(b, t, -1, h, w)
+ enc_feat = enc_feat + trans_feat
+
+ # decode frames from features
+ output = self.decoder(enc_feat.view(b * t, c, h, w))
+ output = torch.tanh(output)
+ return output, pred_flows
+
+
+# ######################################################################
+# Discriminator for Temporal Patch GAN
+# ######################################################################
+
+
+class Discriminator(BaseNetwork):
+ def __init__(self,
+ in_channels=3,
+ use_sigmoid=False,
+ use_spectral_norm=True,
+ init_weights=True):
+ super(Discriminator, self).__init__()
+ self.use_sigmoid = use_sigmoid
+ nf = 32
+
+ self.conv = nn.Sequential(
+ spectral_norm(
+ nn.Conv3d(in_channels=in_channels,
+ out_channels=nf * 1,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=1,
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(64, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ spectral_norm(
+ nn.Conv3d(nf * 1,
+ nf * 2,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2),
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(128, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ spectral_norm(
+ nn.Conv3d(nf * 2,
+ nf * 4,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2),
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(256, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ spectral_norm(
+ nn.Conv3d(nf * 4,
+ nf * 4,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2),
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(256, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ spectral_norm(
+ nn.Conv3d(nf * 4,
+ nf * 4,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2),
+ bias=not use_spectral_norm), use_spectral_norm),
+ # nn.InstanceNorm2d(256, track_running_stats=False),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv3d(nf * 4,
+ nf * 4,
+ kernel_size=(3, 5, 5),
+ stride=(1, 2, 2),
+ padding=(1, 2, 2)))
+
+ if init_weights:
+ self.init_weights()
+
+ def forward(self, xs):
+ # T, C, H, W = xs.shape (old)
+ # B, T, C, H, W (new)
+ xs_t = torch.transpose(xs, 1, 2)
+ feat = self.conv(xs_t)
+ if self.use_sigmoid:
+ feat = torch.sigmoid(feat)
+ out = torch.transpose(feat, 1, 2) # B, T, C, H, W
+ return out
+
+
+def spectral_norm(module, mode=True):
+ if mode:
+ return _spectral_norm(module)
+ return module
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/__init__.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/feat_prop.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/feat_prop.py
new file mode 100644
index 0000000000000000000000000000000000000000..3957a72e8e97c4f88c45da4fc12334c343073ce2
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/feat_prop.py
@@ -0,0 +1,149 @@
+"""
+ BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation and Alignment, CVPR 2022
+"""
+import torch
+import torch.nn as nn
+
+from mmcv.ops import ModulatedDeformConv2d, modulated_deform_conv2d
+from mmcv.cnn import constant_init
+
+from E2FGVI.model.modules.flow_comp import flow_warp
+
+
+class SecondOrderDeformableAlignment(ModulatedDeformConv2d):
+ """Second-order deformable alignment module."""
+ def __init__(self, *args, **kwargs):
+ self.max_residue_magnitude = kwargs.pop('max_residue_magnitude', 10)
+
+ super(SecondOrderDeformableAlignment, self).__init__(*args, **kwargs)
+
+ self.conv_offset = nn.Sequential(
+ nn.Conv2d(3 * self.out_channels + 4, self.out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1, inplace=True),
+ nn.Conv2d(self.out_channels, self.out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1, inplace=True),
+ nn.Conv2d(self.out_channels, self.out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1, inplace=True),
+ nn.Conv2d(self.out_channels, 27 * self.deform_groups, 3, 1, 1),
+ )
+
+ self.init_offset()
+
+ def init_offset(self):
+ constant_init(self.conv_offset[-1], val=0, bias=0)
+
+ def forward(self, x, extra_feat, flow_1, flow_2):
+ extra_feat = torch.cat([extra_feat, flow_1, flow_2], dim=1)
+ out = self.conv_offset(extra_feat)
+ o1, o2, mask = torch.chunk(out, 3, dim=1)
+
+ # offset
+ offset = self.max_residue_magnitude * torch.tanh(
+ torch.cat((o1, o2), dim=1))
+ offset_1, offset_2 = torch.chunk(offset, 2, dim=1)
+ offset_1 = offset_1 + flow_1.flip(1).repeat(1,
+ offset_1.size(1) // 2, 1,
+ 1)
+ offset_2 = offset_2 + flow_2.flip(1).repeat(1,
+ offset_2.size(1) // 2, 1,
+ 1)
+ offset = torch.cat([offset_1, offset_2], dim=1)
+
+ # mask
+ mask = torch.sigmoid(mask)
+
+ return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias,
+ self.stride, self.padding,
+ self.dilation, self.groups,
+ self.deform_groups)
+
+
+class BidirectionalPropagation(nn.Module):
+ def __init__(self, channel):
+ super(BidirectionalPropagation, self).__init__()
+ modules = ['backward_', 'forward_']
+ self.deform_align = nn.ModuleDict()
+ self.backbone = nn.ModuleDict()
+ self.channel = channel
+
+ for i, module in enumerate(modules):
+ self.deform_align[module] = SecondOrderDeformableAlignment(
+ 2 * channel, channel, 3, padding=1, deform_groups=16)
+
+ self.backbone[module] = nn.Sequential(
+ nn.Conv2d((2 + i) * channel, channel, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1, inplace=True),
+ nn.Conv2d(channel, channel, 3, 1, 1),
+ )
+
+ self.fusion = nn.Conv2d(2 * channel, channel, 1, 1, 0)
+
+ def forward(self, x, flows_backward, flows_forward):
+ """
+ x shape : [b, t, c, h, w]
+ return [b, t, c, h, w]
+ """
+ b, t, c, h, w = x.shape
+ feats = {}
+ feats['spatial'] = [x[:, i, :, :, :] for i in range(0, t)]
+
+ for module_name in ['backward_', 'forward_']:
+
+ feats[module_name] = []
+
+ frame_idx = range(0, t)
+ flow_idx = range(-1, t - 1)
+ mapping_idx = list(range(0, len(feats['spatial'])))
+ mapping_idx += mapping_idx[::-1]
+
+ if 'backward' in module_name:
+ frame_idx = frame_idx[::-1]
+ flows = flows_backward
+ else:
+ flows = flows_forward
+
+ feat_prop = x.new_zeros(b, self.channel, h, w)
+ for i, idx in enumerate(frame_idx):
+ feat_current = feats['spatial'][mapping_idx[idx]]
+
+ if i > 0:
+ flow_n1 = flows[:, flow_idx[i], :, :, :]
+ cond_n1 = flow_warp(feat_prop, flow_n1.permute(0, 2, 3, 1))
+
+ # initialize second-order features
+ feat_n2 = torch.zeros_like(feat_prop)
+ flow_n2 = torch.zeros_like(flow_n1)
+ cond_n2 = torch.zeros_like(cond_n1)
+ if i > 1:
+ feat_n2 = feats[module_name][-2]
+ flow_n2 = flows[:, flow_idx[i - 1], :, :, :]
+ flow_n2 = flow_n1 + flow_warp(
+ flow_n2, flow_n1.permute(0, 2, 3, 1))
+ cond_n2 = flow_warp(feat_n2,
+ flow_n2.permute(0, 2, 3, 1))
+
+ cond = torch.cat([cond_n1, feat_current, cond_n2], dim=1)
+ feat_prop = torch.cat([feat_prop, feat_n2], dim=1)
+ feat_prop = self.deform_align[module_name](feat_prop, cond,
+ flow_n1,
+ flow_n2)
+
+ feat = [feat_current] + [
+ feats[k][idx]
+ for k in feats if k not in ['spatial', module_name]
+ ] + [feat_prop]
+
+ feat = torch.cat(feat, dim=1)
+ feat_prop = feat_prop + self.backbone[module_name](feat)
+ feats[module_name].append(feat_prop)
+
+ if 'backward' in module_name:
+ feats[module_name] = feats[module_name][::-1]
+
+ outputs = []
+ for i in range(0, t):
+ align_feats = [feats[k].pop(0) for k in feats if k != 'spatial']
+ align_feats = torch.cat(align_feats, dim=1)
+ outputs.append(self.fusion(align_feats))
+
+ return torch.stack(outputs, dim=1) + x
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/flow_comp.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/flow_comp.py
new file mode 100644
index 0000000000000000000000000000000000000000..a33a8069e52803b9824798ee2b6602dfe560f83b
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/flow_comp.py
@@ -0,0 +1,450 @@
+import numpy as np
+
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+
+from mmcv.cnn import ConvModule
+from mmcv.runner import load_checkpoint
+
+
+class FlowCompletionLoss(nn.Module):
+ """Flow completion loss"""
+ def __init__(self):
+ super().__init__()
+ self.fix_spynet = SPyNet()
+ for p in self.fix_spynet.parameters():
+ p.requires_grad = False
+
+ self.l1_criterion = nn.L1Loss()
+
+ def forward(self, pred_flows, gt_local_frames):
+ b, l_t, c, h, w = gt_local_frames.size()
+
+ with torch.no_grad():
+ # compute gt forward and backward flows
+ gt_local_frames = F.interpolate(gt_local_frames.view(-1, c, h, w),
+ scale_factor=1 / 4,
+ mode='bilinear',
+ align_corners=True,
+ recompute_scale_factor=True)
+ gt_local_frames = gt_local_frames.view(b, l_t, c, h // 4, w // 4)
+ gtlf_1 = gt_local_frames[:, :-1, :, :, :].reshape(
+ -1, c, h // 4, w // 4)
+ gtlf_2 = gt_local_frames[:, 1:, :, :, :].reshape(
+ -1, c, h // 4, w // 4)
+ gt_flows_forward = self.fix_spynet(gtlf_1, gtlf_2)
+ gt_flows_backward = self.fix_spynet(gtlf_2, gtlf_1)
+
+ # calculate loss for flow completion
+ forward_flow_loss = self.l1_criterion(
+ pred_flows[0].view(-1, 2, h // 4, w // 4), gt_flows_forward)
+ backward_flow_loss = self.l1_criterion(
+ pred_flows[1].view(-1, 2, h // 4, w // 4), gt_flows_backward)
+ flow_loss = forward_flow_loss + backward_flow_loss
+
+ return flow_loss
+
+
+class SPyNet(nn.Module):
+ """SPyNet network structure.
+    The differences to the SPyNet in [tof.py] are that
+    1. more SPyNetBasicModules are used in this version, and
+    2. no batch normalization is used in this version.
+ Paper:
+ Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017
+ Args:
+ pretrained (str): path for pre-trained SPyNet. Default: None.
+ """
+ def __init__(
+ self,
+ use_pretrain=True,
+ pretrained='https://download.openmmlab.com/mmediting/restorers/basicvsr/spynet_20210409-c6c1bd09.pth'
+ ):
+ super().__init__()
+
+ self.basic_module = nn.ModuleList(
+ [SPyNetBasicModule() for _ in range(6)])
+
+ if use_pretrain:
+ if isinstance(pretrained, str):
+ print("load pretrained SPyNet...")
+ load_checkpoint(self, pretrained, strict=True)
+ elif pretrained is not None:
+ raise TypeError('[pretrained] should be str or None, '
+ f'but got {type(pretrained)}.')
+
+ self.register_buffer(
+ 'mean',
+ torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
+ self.register_buffer(
+ 'std',
+ torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
+
+ def compute_flow(self, ref, supp):
+ """Compute flow from ref to supp.
+ Note that in this function, the images are already resized to a
+ multiple of 32.
+ Args:
+ ref (Tensor): Reference image with shape of (n, 3, h, w).
+ supp (Tensor): Supporting image with shape of (n, 3, h, w).
+ Returns:
+ Tensor: Estimated optical flow: (n, 2, h, w).
+ """
+ n, _, h, w = ref.size()
+
+ # normalize the input images
+ ref = [(ref - self.mean) / self.std]
+ supp = [(supp - self.mean) / self.std]
+
+ # generate downsampled frames
+ for level in range(5):
+ ref.append(
+ F.avg_pool2d(input=ref[-1],
+ kernel_size=2,
+ stride=2,
+ count_include_pad=False))
+ supp.append(
+ F.avg_pool2d(input=supp[-1],
+ kernel_size=2,
+ stride=2,
+ count_include_pad=False))
+ ref = ref[::-1]
+ supp = supp[::-1]
+
+ # flow computation
+ flow = ref[0].new_zeros(n, 2, h // 32, w // 32)
+ for level in range(len(ref)):
+ if level == 0:
+ flow_up = flow
+ else:
+ flow_up = F.interpolate(input=flow,
+ scale_factor=2,
+ mode='bilinear',
+ align_corners=True) * 2.0
+
+ # add the residue to the upsampled flow
+ flow = flow_up + self.basic_module[level](torch.cat([
+ ref[level],
+ flow_warp(supp[level],
+ flow_up.permute(0, 2, 3, 1).contiguous(),
+ padding_mode='border'), flow_up
+ ], 1))
+
+ return flow
+
+ def forward(self, ref, supp):
+ """Forward function of SPyNet.
+ This function computes the optical flow from ref to supp.
+ Args:
+ ref (Tensor): Reference image with shape of (n, 3, h, w).
+ supp (Tensor): Supporting image with shape of (n, 3, h, w).
+ Returns:
+ Tensor: Estimated optical flow: (n, 2, h, w).
+ """
+
+ # upsize to a multiple of 32
+ h, w = ref.shape[2:4]
+ w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1)
+ h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1)
+ ref = F.interpolate(input=ref,
+ size=(h_up, w_up),
+ mode='bilinear',
+ align_corners=False)
+ supp = F.interpolate(input=supp,
+ size=(h_up, w_up),
+ mode='bilinear',
+ align_corners=False)
+
+ # compute flow, and resize back to the original resolution
+ flow = F.interpolate(input=self.compute_flow(ref, supp),
+ size=(h, w),
+ mode='bilinear',
+ align_corners=False)
+
+ # adjust the flow values
+ flow[:, 0, :, :] *= float(w) / float(w_up)
+ flow[:, 1, :, :] *= float(h) / float(h_up)
+
+ return flow
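+
+
+# Illustrative sketch (added for documentation; not part of upstream E2FGVI):
+# SPyNet resizes its inputs up to a multiple of 32 (e.g. 240x432 becomes
+# 256x448), estimates flow coarse-to-fine, and rescales the flow back, so the
+# output always matches the input resolution. use_pretrain=False keeps this
+# sketch offline; real usage loads the pretrained SPyNet weights.
+def _demo_spynet():
+    ref = torch.randn(1, 3, 64, 96)
+    supp = torch.randn(1, 3, 64, 96)
+    flow = SPyNet(use_pretrain=False)(ref, supp)
+    print(flow.shape)  # torch.Size([1, 2, 64, 96])
+    return flow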
+
+
+class SPyNetBasicModule(nn.Module):
+ """Basic Module for SPyNet.
+ Paper:
+ Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017
+ """
+ def __init__(self):
+ super().__init__()
+
+ self.basic_module = nn.Sequential(
+ ConvModule(in_channels=8,
+ out_channels=32,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ norm_cfg=None,
+ act_cfg=dict(type='ReLU')),
+ ConvModule(in_channels=32,
+ out_channels=64,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ norm_cfg=None,
+ act_cfg=dict(type='ReLU')),
+ ConvModule(in_channels=64,
+ out_channels=32,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ norm_cfg=None,
+ act_cfg=dict(type='ReLU')),
+ ConvModule(in_channels=32,
+ out_channels=16,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ norm_cfg=None,
+ act_cfg=dict(type='ReLU')),
+ ConvModule(in_channels=16,
+ out_channels=2,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ norm_cfg=None,
+ act_cfg=None))
+
+ def forward(self, tensor_input):
+ """
+ Args:
+ tensor_input (Tensor): Input tensor with shape (b, 8, h, w).
+ 8 channels contain:
+ [reference image (3), neighbor image (3), initial flow (2)].
+ Returns:
+ Tensor: Refined flow with shape (b, 2, h, w)
+ """
+ return self.basic_module(tensor_input)
+
+
+# Flow visualization code used from https://github.com/tomrunia/OpticalFlow_Visualization
+def make_colorwheel():
+ """
+ Generates a color wheel for optical flow visualization as presented in:
+ Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
+ URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
+
+ Code follows the original C++ source code of Daniel Scharstein.
+    Code follows the Matlab source code of Deqing Sun.
+
+ Returns:
+ np.ndarray: Color wheel
+ """
+
+ RY = 15
+ YG = 6
+ GC = 4
+ CB = 11
+ BM = 13
+ MR = 6
+
+ ncols = RY + YG + GC + CB + BM + MR
+ colorwheel = np.zeros((ncols, 3))
+ col = 0
+
+ # RY
+ colorwheel[0:RY, 0] = 255
+ colorwheel[0:RY, 1] = np.floor(255 * np.arange(0, RY) / RY)
+ col = col + RY
+ # YG
+ colorwheel[col:col + YG, 0] = 255 - np.floor(255 * np.arange(0, YG) / YG)
+ colorwheel[col:col + YG, 1] = 255
+ col = col + YG
+ # GC
+ colorwheel[col:col + GC, 1] = 255
+ colorwheel[col:col + GC, 2] = np.floor(255 * np.arange(0, GC) / GC)
+ col = col + GC
+ # CB
+ colorwheel[col:col + CB, 1] = 255 - np.floor(255 * np.arange(CB) / CB)
+ colorwheel[col:col + CB, 2] = 255
+ col = col + CB
+ # BM
+ colorwheel[col:col + BM, 2] = 255
+ colorwheel[col:col + BM, 0] = np.floor(255 * np.arange(0, BM) / BM)
+ col = col + BM
+ # MR
+ colorwheel[col:col + MR, 2] = 255 - np.floor(255 * np.arange(MR) / MR)
+ colorwheel[col:col + MR, 0] = 255
+ return colorwheel
+
+
+def flow_uv_to_colors(u, v, convert_to_bgr=False):
+ """
+ Applies the flow color wheel to (possibly clipped) flow components u and v.
+
+ According to the C++ source code of Daniel Scharstein
+ According to the Matlab source code of Deqing Sun
+
+ Args:
+ u (np.ndarray): Input horizontal flow of shape [H,W]
+ v (np.ndarray): Input vertical flow of shape [H,W]
+ convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
+
+ Returns:
+ np.ndarray: Flow visualization image of shape [H,W,3]
+ """
+ flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
+ colorwheel = make_colorwheel() # shape [55x3]
+ ncols = colorwheel.shape[0]
+ rad = np.sqrt(np.square(u) + np.square(v))
+ a = np.arctan2(-v, -u) / np.pi
+ fk = (a + 1) / 2 * (ncols - 1)
+ k0 = np.floor(fk).astype(np.int32)
+ k1 = k0 + 1
+ k1[k1 == ncols] = 0
+ f = fk - k0
+ for i in range(colorwheel.shape[1]):
+ tmp = colorwheel[:, i]
+ col0 = tmp[k0] / 255.0
+ col1 = tmp[k1] / 255.0
+ col = (1 - f) * col0 + f * col1
+ idx = (rad <= 1)
+ col[idx] = 1 - rad[idx] * (1 - col[idx])
+ col[~idx] = col[~idx] * 0.75 # out of range
+ # Note the 2-i => BGR instead of RGB
+ ch_idx = 2 - i if convert_to_bgr else i
+ flow_image[:, :, ch_idx] = np.floor(255 * col)
+ return flow_image
+
+
+def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
+ """
+    Expects a two-dimensional flow image of shape [H,W,2].
+
+ Args:
+ flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
+ clip_flow (float, optional): Clip maximum of flow values. Defaults to None.
+ convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
+
+ Returns:
+ np.ndarray: Flow visualization image of shape [H,W,3]
+ """
+ assert flow_uv.ndim == 3, 'input flow must have three dimensions'
+ assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]'
+ if clip_flow is not None:
+ flow_uv = np.clip(flow_uv, 0, clip_flow)
+ u = flow_uv[:, :, 0]
+ v = flow_uv[:, :, 1]
+ rad = np.sqrt(np.square(u) + np.square(v))
+ rad_max = np.max(rad)
+ epsilon = 1e-5
+ u = u / (rad_max + epsilon)
+ v = v / (rad_max + epsilon)
+ return flow_uv_to_colors(u, v, convert_to_bgr)
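+
+
+# Illustrative sketch (added for documentation; not part of upstream E2FGVI):
+# a constant rightward flow field maps to a single hue, and flow_to_image
+# returns an RGB uint8 visualization with the same spatial size as the flow.
+def _demo_flow_to_image():
+    flow = np.zeros((64, 64, 2), dtype=np.float32)
+    flow[..., 0] = 1.0  # constant horizontal motion
+    rgb = flow_to_image(flow)
+    print(rgb.shape, rgb.dtype)  # (64, 64, 3) uint8
+    return rgb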
+
+
+def flow_warp(x,
+ flow,
+ interpolation='bilinear',
+ padding_mode='zeros',
+ align_corners=True):
+ """Warp an image or a feature map with optical flow.
+ Args:
+ x (Tensor): Tensor with size (n, c, h, w).
+        flow (Tensor): Tensor with size (n, h, w, 2). The last dimension
+            holds two channels denoting the relative offsets along width and height.
+ Note that the values are not normalized to [-1, 1].
+ interpolation (str): Interpolation mode: 'nearest' or 'bilinear'.
+ Default: 'bilinear'.
+ padding_mode (str): Padding mode: 'zeros' or 'border' or 'reflection'.
+ Default: 'zeros'.
+        align_corners (bool): Whether to align corners. Default: True.
+ Returns:
+ Tensor: Warped image or feature map.
+ """
+ if x.size()[-2:] != flow.size()[1:3]:
+ raise ValueError(f'The spatial sizes of input ({x.size()[-2:]}) and '
+ f'flow ({flow.size()[1:3]}) are not the same.')
+ _, _, h, w = x.size()
+ # create mesh grid
+ grid_y, grid_x = torch.meshgrid(torch.arange(0, h), torch.arange(0, w))
+ grid = torch.stack((grid_x, grid_y), 2).type_as(x) # (w, h, 2)
+ grid.requires_grad = False
+
+ grid_flow = grid + flow
+ # scale grid_flow to [-1,1]
+ grid_flow_x = 2.0 * grid_flow[:, :, :, 0] / max(w - 1, 1) - 1.0
+ grid_flow_y = 2.0 * grid_flow[:, :, :, 1] / max(h - 1, 1) - 1.0
+ grid_flow = torch.stack((grid_flow_x, grid_flow_y), dim=3)
+ output = F.grid_sample(x,
+ grid_flow,
+ mode=interpolation,
+ padding_mode=padding_mode,
+ align_corners=align_corners)
+ return output
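+
+# Illustrative sanity check (not part of the original module): an all-zero flow
+# leaves the sampling grid unchanged, so with bilinear sampling at exact grid
+# points and align_corners=True the warp returns the input:
+#
+#   x = torch.randn(1, 3, 8, 8)
+#   zero_flow = torch.zeros(1, 8, 8, 2)
+#   out = flow_warp(x, zero_flow)
+#   assert torch.allclose(out, x, atol=1e-5)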
+
+
+def initial_mask_flow(mask):
+ """
+    mask: 1 indicates a valid pixel, 0 indicates an unknown pixel.
+ """
+ B, T, C, H, W = mask.shape
+
+ # calculate relative position
+ grid_y, grid_x = torch.meshgrid(torch.arange(0, H), torch.arange(0, W))
+
+ grid_y, grid_x = grid_y.type_as(mask), grid_x.type_as(mask)
+ abs_relative_pos_y = H - torch.abs(grid_y[None, :, :] - grid_y[:, None, :])
+ relative_pos_y = H - (grid_y[None, :, :] - grid_y[:, None, :])
+
+ abs_relative_pos_x = W - torch.abs(grid_x[:, None, :] - grid_x[:, :, None])
+ relative_pos_x = W - (grid_x[:, None, :] - grid_x[:, :, None])
+
+ # calculate the nearest indices
+ pos_up = mask.unsqueeze(3).repeat(
+ 1, 1, 1, H, 1, 1).flip(4) * abs_relative_pos_y[None, None, None] * (
+ relative_pos_y <= H)[None, None, None]
+ nearest_indice_up = pos_up.max(dim=4)[1]
+
+ pos_down = mask.unsqueeze(3).repeat(1, 1, 1, H, 1, 1) * abs_relative_pos_y[
+ None, None, None] * (relative_pos_y <= H)[None, None, None]
+ nearest_indice_down = (pos_down).max(dim=4)[1]
+
+ pos_left = mask.unsqueeze(4).repeat(
+ 1, 1, 1, 1, W, 1).flip(5) * abs_relative_pos_x[None, None, None] * (
+ relative_pos_x <= W)[None, None, None]
+ nearest_indice_left = (pos_left).max(dim=5)[1]
+
+ pos_right = mask.unsqueeze(4).repeat(
+ 1, 1, 1, 1, W, 1) * abs_relative_pos_x[None, None, None] * (
+ relative_pos_x <= W)[None, None, None]
+ nearest_indice_right = (pos_right).max(dim=5)[1]
+
+ # NOTE: IMPORTANT !!! depending on how to use this offset
+ initial_offset_up = -(nearest_indice_up - grid_y[None, None, None]).flip(3)
+ initial_offset_down = nearest_indice_down - grid_y[None, None, None]
+
+ initial_offset_left = -(nearest_indice_left -
+ grid_x[None, None, None]).flip(4)
+ initial_offset_right = nearest_indice_right - grid_x[None, None, None]
+
+ # nearest_indice_x = (mask.unsqueeze(1).repeat(1, img_width, 1) * relative_pos_x).max(dim=2)[1]
+ # initial_offset_x = nearest_indice_x - grid_x
+
+ # handle the boundary cases
+ final_offset_down = (initial_offset_down < 0) * initial_offset_up + (
+ initial_offset_down > 0) * initial_offset_down
+ final_offset_up = (initial_offset_up > 0) * initial_offset_down + (
+ initial_offset_up < 0) * initial_offset_up
+ final_offset_right = (initial_offset_right < 0) * initial_offset_left + (
+ initial_offset_right > 0) * initial_offset_right
+ final_offset_left = (initial_offset_left > 0) * initial_offset_right + (
+ initial_offset_left < 0) * initial_offset_left
+ zero_offset = torch.zeros_like(final_offset_down)
+ # out = torch.cat([final_offset_left, zero_offset, final_offset_right, zero_offset, zero_offset, final_offset_up, zero_offset, final_offset_down], dim=2)
+ out = torch.cat([
+ zero_offset, final_offset_left, zero_offset, final_offset_right,
+ final_offset_up, zero_offset, final_offset_down, zero_offset
+ ],
+ dim=2)
+
+ return out
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/spectral_norm.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/spectral_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f38c34e98c03caa28ce0b15a4083215fb7d8e9af
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/spectral_norm.py
@@ -0,0 +1,288 @@
+"""
+Spectral Normalization from https://arxiv.org/abs/1802.05957
+"""
+import torch
+from torch.nn.functional import normalize
+
+
+class SpectralNorm(object):
+ # Invariant before and after each forward call:
+ # u = normalize(W @ v)
+ # NB: At initialization, this invariant is not enforced
+
+ _version = 1
+
+ # At version 1:
+ # made `W` not a buffer,
+ # added `v` as a buffer, and
+ # made eval mode use `W = u @ W_orig @ v` rather than the stored `W`.
+
+ def __init__(self, name='weight', n_power_iterations=1, dim=0, eps=1e-12):
+ self.name = name
+ self.dim = dim
+ if n_power_iterations <= 0:
+ raise ValueError(
+ 'Expected n_power_iterations to be positive, but '
+ 'got n_power_iterations={}'.format(n_power_iterations))
+ self.n_power_iterations = n_power_iterations
+ self.eps = eps
+
+ def reshape_weight_to_matrix(self, weight):
+ weight_mat = weight
+ if self.dim != 0:
+ # permute dim to front
+ weight_mat = weight_mat.permute(
+ self.dim,
+ *[d for d in range(weight_mat.dim()) if d != self.dim])
+ height = weight_mat.size(0)
+ return weight_mat.reshape(height, -1)
+
+ def compute_weight(self, module, do_power_iteration):
+ # NB: If `do_power_iteration` is set, the `u` and `v` vectors are
+ # updated in power iteration **in-place**. This is very important
+ # because in `DataParallel` forward, the vectors (being buffers) are
+ # broadcast from the parallelized module to each module replica,
+ # which is a new module object created on the fly. And each replica
+ # runs its own spectral norm power iteration. So simply assigning
+ # the updated vectors to the module this function runs on will cause
+ # the update to be lost forever. And the next time the parallelized
+ # module is replicated, the same randomly initialized vectors are
+ # broadcast and used!
+ #
+ # Therefore, to make the change propagate back, we rely on two
+ # important behaviors (also enforced via tests):
+ # 1. `DataParallel` doesn't clone storage if the broadcast tensor
+ # is already on correct device; and it makes sure that the
+ # parallelized module is already on `device[0]`.
+ # 2. If the out tensor in `out=` kwarg has correct shape, it will
+ # just fill in the values.
+ # Therefore, since the same power iteration is performed on all
+ # devices, simply updating the tensors in-place will make sure that
+ # the module replica on `device[0]` will update the _u vector on the
+        # parallelized module (by shared storage).
+ #
+ # However, after we update `u` and `v` in-place, we need to **clone**
+ # them before using them to normalize the weight. This is to support
+        # backpropagating through two forward passes, e.g., the common pattern
+        # in GAN training: loss = D(real) - D(fake). Otherwise, the engine will
+ # complain that variables needed to do backward for the first forward
+ # (i.e., the `u` and `v` vectors) are changed in the second forward.
+ weight = getattr(module, self.name + '_orig')
+ u = getattr(module, self.name + '_u')
+ v = getattr(module, self.name + '_v')
+ weight_mat = self.reshape_weight_to_matrix(weight)
+
+ if do_power_iteration:
+ with torch.no_grad():
+ for _ in range(self.n_power_iterations):
+                    # Spectral norm of the weight equals `u^T W v`, where `u` and `v`
+ # are the first left and right singular vectors.
+ # This power iteration produces approximations of `u` and `v`.
+ v = normalize(torch.mv(weight_mat.t(), u),
+ dim=0,
+ eps=self.eps,
+ out=v)
+ u = normalize(torch.mv(weight_mat, v),
+ dim=0,
+ eps=self.eps,
+ out=u)
+ if self.n_power_iterations > 0:
+ # See above on why we need to clone
+ u = u.clone()
+ v = v.clone()
+
+ sigma = torch.dot(u, torch.mv(weight_mat, v))
+ weight = weight / sigma
+ return weight
+
+ def remove(self, module):
+ with torch.no_grad():
+ weight = self.compute_weight(module, do_power_iteration=False)
+ delattr(module, self.name)
+ delattr(module, self.name + '_u')
+ delattr(module, self.name + '_v')
+ delattr(module, self.name + '_orig')
+ module.register_parameter(self.name,
+ torch.nn.Parameter(weight.detach()))
+
+ def __call__(self, module, inputs):
+ setattr(
+ module, self.name,
+ self.compute_weight(module, do_power_iteration=module.training))
+
+ def _solve_v_and_rescale(self, weight_mat, u, target_sigma):
+        # Tries to return a vector `v` s.t. `u = normalize(W @ v)`
+ # (the invariant at top of this class) and `u @ W @ v = sigma`.
+ # This uses pinverse in case W^T W is not invertible.
+ v = torch.chain_matmul(weight_mat.t().mm(weight_mat).pinverse(),
+ weight_mat.t(), u.unsqueeze(1)).squeeze(1)
+ return v.mul_(target_sigma / torch.dot(u, torch.mv(weight_mat, v)))
+
+ @staticmethod
+ def apply(module, name, n_power_iterations, dim, eps):
+ for k, hook in module._forward_pre_hooks.items():
+ if isinstance(hook, SpectralNorm) and hook.name == name:
+ raise RuntimeError(
+ "Cannot register two spectral_norm hooks on "
+ "the same parameter {}".format(name))
+
+ fn = SpectralNorm(name, n_power_iterations, dim, eps)
+ weight = module._parameters[name]
+
+ with torch.no_grad():
+ weight_mat = fn.reshape_weight_to_matrix(weight)
+
+ h, w = weight_mat.size()
+ # randomly initialize `u` and `v`
+ u = normalize(weight.new_empty(h).normal_(0, 1), dim=0, eps=fn.eps)
+ v = normalize(weight.new_empty(w).normal_(0, 1), dim=0, eps=fn.eps)
+
+ delattr(module, fn.name)
+ module.register_parameter(fn.name + "_orig", weight)
+ # We still need to assign weight back as fn.name because all sorts of
+ # things may assume that it exists, e.g., when initializing weights.
+ # However, we can't directly assign as it could be an nn.Parameter and
+ # gets added as a parameter. Instead, we register weight.data as a plain
+ # attribute.
+ setattr(module, fn.name, weight.data)
+ module.register_buffer(fn.name + "_u", u)
+ module.register_buffer(fn.name + "_v", v)
+
+ module.register_forward_pre_hook(fn)
+
+ module._register_state_dict_hook(SpectralNormStateDictHook(fn))
+ module._register_load_state_dict_pre_hook(
+ SpectralNormLoadStateDictPreHook(fn))
+ return fn
+
+
+# This is a top level class because Py2 pickle doesn't like inner class nor an
+# instancemethod.
+class SpectralNormLoadStateDictPreHook(object):
+ # See docstring of SpectralNorm._version on the changes to spectral_norm.
+ def __init__(self, fn):
+ self.fn = fn
+
+ # For state_dict with version None, (assuming that it has gone through at
+ # least one training forward), we have
+ #
+ # u = normalize(W_orig @ v)
+ # W = W_orig / sigma, where sigma = u @ W_orig @ v
+ #
+ # To compute `v`, we solve `W_orig @ x = u`, and let
+ # v = x / (u @ W_orig @ x) * (W / W_orig).
+ def __call__(self, state_dict, prefix, local_metadata, strict,
+ missing_keys, unexpected_keys, error_msgs):
+ fn = self.fn
+ version = local_metadata.get('spectral_norm',
+ {}).get(fn.name + '.version', None)
+ if version is None or version < 1:
+ with torch.no_grad():
+ weight_orig = state_dict[prefix + fn.name + '_orig']
+ # weight = state_dict.pop(prefix + fn.name)
+ # sigma = (weight_orig / weight).mean()
+ weight_mat = fn.reshape_weight_to_matrix(weight_orig)
+ u = state_dict[prefix + fn.name + '_u']
+ # v = fn._solve_v_and_rescale(weight_mat, u, sigma)
+ # state_dict[prefix + fn.name + '_v'] = v
+
+
+# This is a top level class because Py2 pickle doesn't like inner class nor an
+# instancemethod.
+class SpectralNormStateDictHook(object):
+ # See docstring of SpectralNorm._version on the changes to spectral_norm.
+ def __init__(self, fn):
+ self.fn = fn
+
+ def __call__(self, module, state_dict, prefix, local_metadata):
+ if 'spectral_norm' not in local_metadata:
+ local_metadata['spectral_norm'] = {}
+ key = self.fn.name + '.version'
+ if key in local_metadata['spectral_norm']:
+ raise RuntimeError(
+ "Unexpected key in metadata['spectral_norm']: {}".format(key))
+ local_metadata['spectral_norm'][key] = self.fn._version
+
+
+def spectral_norm(module,
+ name='weight',
+ n_power_iterations=1,
+ eps=1e-12,
+ dim=None):
+ r"""Applies spectral normalization to a parameter in the given module.
+
+ .. math::
+ \mathbf{W}_{SN} = \dfrac{\mathbf{W}}{\sigma(\mathbf{W})},
+ \sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \dfrac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2}
+
+ Spectral normalization stabilizes the training of discriminators (critics)
+ in Generative Adversarial Networks (GANs) by rescaling the weight tensor
+ with spectral norm :math:`\sigma` of the weight matrix calculated using
+ power iteration method. If the dimension of the weight tensor is greater
+ than 2, it is reshaped to 2D in power iteration method to get spectral
+ norm. This is implemented via a hook that calculates spectral norm and
+ rescales weight before every :meth:`~Module.forward` call.
+
+ See `Spectral Normalization for Generative Adversarial Networks`_ .
+
+ .. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957
+
+ Args:
+ module (nn.Module): containing module
+ name (str, optional): name of weight parameter
+ n_power_iterations (int, optional): number of power iterations to
+ calculate spectral norm
+ eps (float, optional): epsilon for numerical stability in
+ calculating norms
+ dim (int, optional): dimension corresponding to number of outputs,
+ the default is ``0``, except for modules that are instances of
+ ConvTranspose{1,2,3}d, when it is ``1``
+
+ Returns:
+ The original module with the spectral norm hook
+
+ Example::
+
+ >>> m = spectral_norm(nn.Linear(20, 40))
+ >>> m
+ Linear(in_features=20, out_features=40, bias=True)
+ >>> m.weight_u.size()
+ torch.Size([40])
+
+ """
+ if dim is None:
+ if isinstance(module,
+ (torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d,
+ torch.nn.ConvTranspose3d)):
+ dim = 1
+ else:
+ dim = 0
+ SpectralNorm.apply(module, name, n_power_iterations, dim, eps)
+ return module
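+
+# Illustrative sketch (not part of the original module), assuming a recent
+# PyTorch with torch.linalg: after enough power iterations and one forward
+# pass (which triggers the pre-forward hook), the largest singular value of
+# the effective weight is approximately 1.
+#
+#   m = spectral_norm(torch.nn.Linear(20, 40), n_power_iterations=50)
+#   _ = m(torch.randn(8, 20))                           # hook normalizes weight
+#   sigma = torch.linalg.svdvals(m.weight.detach())[0]  # ~1.0 up to iteration error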
+
+
+def remove_spectral_norm(module, name='weight'):
+ r"""Removes the spectral normalization reparameterization from a module.
+
+ Args:
+ module (Module): containing module
+ name (str, optional): name of weight parameter
+
+ Example:
+ >>> m = spectral_norm(nn.Linear(40, 10))
+ >>> remove_spectral_norm(m)
+ """
+    for k, hook in module._forward_pre_hooks.items():
+        if isinstance(hook, SpectralNorm) and hook.name == name:
+            hook.remove(module)
+            del module._forward_pre_hooks[k]
+            return module
+
+    raise ValueError("spectral_norm of '{}' not found in {}".format(
+        name, module))
+
+
+def use_spectral_norm(module, use_sn=False):
+ if use_sn:
+ return spectral_norm(module)
+ return module
\ No newline at end of file
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/tfocal_transformer.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/tfocal_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..179508f490f2662331a8817b37513005e98fe4de
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/tfocal_transformer.py
@@ -0,0 +1,536 @@
+"""
+ This code is based on:
+ [1] FuseFormer: Fusing Fine-Grained Information in Transformers for Video Inpainting, ICCV 2021
+ https://github.com/ruiliu-ai/FuseFormer
+ [2] Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet, ICCV 2021
+ https://github.com/yitu-opensource/T2T-ViT
+ [3] Focal Self-attention for Local-Global Interactions in Vision Transformers, NeurIPS 2021
+ https://github.com/microsoft/Focal-Transformer
+"""
+
+import math
+from functools import reduce
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SoftSplit(nn.Module):
+ def __init__(self, channel, hidden, kernel_size, stride, padding,
+ t2t_param):
+ super(SoftSplit, self).__init__()
+ self.kernel_size = kernel_size
+ self.t2t = nn.Unfold(kernel_size=kernel_size,
+ stride=stride,
+ padding=padding)
+ c_in = reduce((lambda x, y: x * y), kernel_size) * channel
+ self.embedding = nn.Linear(c_in, hidden)
+
+ self.f_h = int(
+ (t2t_param['output_size'][0] + 2 * t2t_param['padding'][0] -
+ (t2t_param['kernel_size'][0] - 1) - 1) / t2t_param['stride'][0] +
+ 1)
+ self.f_w = int(
+ (t2t_param['output_size'][1] + 2 * t2t_param['padding'][1] -
+ (t2t_param['kernel_size'][1] - 1) - 1) / t2t_param['stride'][1] +
+ 1)
+
+ def forward(self, x, b):
+ feat = self.t2t(x)
+ feat = feat.permute(0, 2, 1)
+ # feat shape [b*t, num_vec, ks*ks*c]
+ feat = self.embedding(feat)
+ # feat shape after embedding [b, t*num_vec, hidden]
+ feat = feat.view(b, -1, self.f_h, self.f_w, feat.size(2))
+ return feat
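+
+# Worked example of the token-grid size above (illustrative values, not
+# necessarily the actual E2FGVI configuration): with output_size=(60, 108),
+# kernel_size=(7, 7), stride=(3, 3) and padding=(3, 3), the unfold yields
+#   f_h = int((60 + 6 - 6 - 1) / 3 + 1) = 20
+#   f_w = int((108 + 6 - 6 - 1) / 3 + 1) = 36
+# i.e. a 20 x 36 grid of soft patch tokens per frame.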
+
+
+class SoftComp(nn.Module):
+ def __init__(self, channel, hidden, output_size, kernel_size, stride,
+ padding):
+ super(SoftComp, self).__init__()
+ self.relu = nn.LeakyReLU(0.2, inplace=True)
+ c_out = reduce((lambda x, y: x * y), kernel_size) * channel
+ self.embedding = nn.Linear(hidden, c_out)
+ self.t2t = torch.nn.Fold(output_size=output_size,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding)
+ h, w = output_size
+ self.bias = nn.Parameter(torch.zeros((channel, h, w),
+ dtype=torch.float32),
+ requires_grad=True)
+
+ def forward(self, x, t):
+ b_, _, _, _, c_ = x.shape
+ x = x.view(b_, -1, c_)
+ feat = self.embedding(x)
+ b, _, c = feat.size()
+ feat = feat.view(b * t, -1, c).permute(0, 2, 1)
+ feat = self.t2t(feat) + self.bias[None]
+ return feat
+
+
+class FusionFeedForward(nn.Module):
+ def __init__(self, d_model, n_vecs=None, t2t_params=None):
+ super(FusionFeedForward, self).__init__()
+        # We set d_ff to 1960 by default
+ hd = 1960
+ self.conv1 = nn.Sequential(nn.Linear(d_model, hd))
+ self.conv2 = nn.Sequential(nn.GELU(), nn.Linear(hd, d_model))
+ assert t2t_params is not None and n_vecs is not None
+ tp = t2t_params.copy()
+ self.fold = nn.Fold(**tp)
+ del tp['output_size']
+ self.unfold = nn.Unfold(**tp)
+ self.n_vecs = n_vecs
+
+ def forward(self, x):
+ x = self.conv1(x)
+ b, n, c = x.size()
+ normalizer = x.new_ones(b, n, 49).view(-1, self.n_vecs,
+ 49).permute(0, 2, 1)
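+        # NOTE: folding the all-ones `normalizer` counts how many overlapping
+        # patches cover each pixel; dividing the folded features by it averages
+        # those contributions before unfolding back into tokens. The hard-coded
+        # 49 appears to assume a 7x7 t2t kernel.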
+ x = self.unfold(
+ self.fold(x.view(-1, self.n_vecs, c).permute(0, 2, 1)) /
+ self.fold(normalizer)).permute(0, 2, 1).contiguous().view(b, n, c)
+ x = self.conv2(x)
+ return x
+
+
+def window_partition(x, window_size):
+ """
+ Args:
+ x: shape is (B, T, H, W, C)
+ window_size (tuple[int]): window size
+ Returns:
+ windows: (B*num_windows, T*window_size*window_size, C)
+ """
+ B, T, H, W, C = x.shape
+ x = x.view(B, T, H // window_size[0], window_size[0], W // window_size[1],
+ window_size[1], C)
+ windows = x.permute(0, 2, 4, 1, 3, 5, 6).contiguous().view(
+ -1, T * window_size[0] * window_size[1], C)
+ return windows
+
+
+def window_partition_noreshape(x, window_size):
+ """
+ Args:
+ x: shape is (B, T, H, W, C)
+ window_size (tuple[int]): window size
+ Returns:
+ windows: (B, num_windows_h, num_windows_w, T, window_size, window_size, C)
+ """
+ B, T, H, W, C = x.shape
+ x = x.view(B, T, H // window_size[0], window_size[0], W // window_size[1],
+ window_size[1], C)
+ windows = x.permute(0, 2, 4, 1, 3, 5, 6).contiguous()
+ return windows
+
+
+def window_reverse(windows, window_size, T, H, W):
+ """
+ Args:
+ windows: shape is (num_windows*B, T, window_size, window_size, C)
+ window_size (tuple[int]): Window size
+ T (int): Temporal length of video
+ H (int): Height of image
+ W (int): Width of image
+ Returns:
+ x: (B, T, H, W, C)
+ """
+ B = int(windows.shape[0] / (H * W / window_size[0] / window_size[1]))
+ x = windows.view(B, H // window_size[0], W // window_size[1], T,
+ window_size[0], window_size[1], -1)
+ x = x.permute(0, 3, 1, 4, 2, 5, 6).contiguous().view(B, T, H, W, -1)
+ return x
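+
+# Illustrative round trip (not part of the original module): window_partition
+# followed by a reshape and window_reverse recovers the input exactly.
+#
+#   x = torch.randn(2, 5, 10, 18, 32)                    # B, T, H, W, C
+#   ws = (5, 9)
+#   win = window_partition(x, ws)                        # (2*2*2, 5*5*9, 32)
+#   back = window_reverse(win.view(-1, 5, ws[0], ws[1], 32), ws, 5, 10, 18)
+#   assert torch.equal(back, x)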
+
+
+class WindowAttention(nn.Module):
+ """Temporal focal window attention
+ """
+ def __init__(self, dim, expand_size, window_size, focal_window,
+ focal_level, num_heads, qkv_bias, pool_method):
+
+ super().__init__()
+ self.dim = dim
+ self.expand_size = expand_size
+ self.window_size = window_size # Wh, Ww
+ self.pool_method = pool_method
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim**-0.5
+ self.focal_level = focal_level
+ self.focal_window = focal_window
+
+ if any(i > 0 for i in self.expand_size) and focal_level > 0:
+ # get mask for rolled k and rolled v
+ mask_tl = torch.ones(self.window_size[0], self.window_size[1])
+ mask_tl[:-self.expand_size[0], :-self.expand_size[1]] = 0
+ mask_tr = torch.ones(self.window_size[0], self.window_size[1])
+ mask_tr[:-self.expand_size[0], self.expand_size[1]:] = 0
+ mask_bl = torch.ones(self.window_size[0], self.window_size[1])
+ mask_bl[self.expand_size[0]:, :-self.expand_size[1]] = 0
+ mask_br = torch.ones(self.window_size[0], self.window_size[1])
+ mask_br[self.expand_size[0]:, self.expand_size[1]:] = 0
+ mask_rolled = torch.stack((mask_tl, mask_tr, mask_bl, mask_br),
+ 0).flatten(0)
+ self.register_buffer("valid_ind_rolled",
+ mask_rolled.nonzero(as_tuple=False).view(-1))
+
+ if pool_method != "none" and focal_level > 1:
+ self.unfolds = nn.ModuleList()
+
+ # build relative position bias between local patch and pooled windows
+ for k in range(focal_level - 1):
+ stride = 2**k
+ kernel_size = tuple(2 * (i // 2) + 2**k + (2**k - 1)
+ for i in self.focal_window)
+ # define unfolding operations
+ self.unfolds += [
+ nn.Unfold(kernel_size=kernel_size,
+ stride=stride,
+ padding=tuple(i // 2 for i in kernel_size))
+ ]
+
+ # define unfolding index for focal_level > 0
+ if k > 0:
+ mask = torch.zeros(kernel_size)
+ mask[(2**k) - 1:, (2**k) - 1:] = 1
+ self.register_buffer(
+ "valid_ind_unfold_{}".format(k),
+ mask.flatten(0).nonzero(as_tuple=False).view(-1))
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.proj = nn.Linear(dim, dim)
+
+ self.softmax = nn.Softmax(dim=-1)
+
+ def forward(self, x_all, mask_all=None):
+ """
+ Args:
+ x: input features with shape of (B, T, Wh, Ww, C)
+ mask: (0/-inf) mask with shape of (num_windows, T*Wh*Ww, T*Wh*Ww) or None
+
+ output: (nW*B, Wh*Ww, C)
+ """
+ x = x_all[0]
+
+ B, T, nH, nW, C = x.shape
+ qkv = self.qkv(x).reshape(B, T, nH, nW, 3,
+ C).permute(4, 0, 1, 2, 3, 5).contiguous()
+ q, k, v = qkv[0], qkv[1], qkv[2] # B, T, nH, nW, C
+
+ # partition q map
+ (q_windows, k_windows, v_windows) = map(
+ lambda t: window_partition(t, self.window_size).view(
+ -1, T, self.window_size[0] * self.window_size[1], self.
+ num_heads, C // self.num_heads).permute(0, 3, 1, 2, 4).
+ contiguous().view(-1, self.num_heads, T * self.window_size[
+ 0] * self.window_size[1], C // self.num_heads), (q, k, v))
+ # q(k/v)_windows shape : [16, 4, 225, 128]
+
+ if any(i > 0 for i in self.expand_size) and self.focal_level > 0:
+ (k_tl, v_tl) = map(
+ lambda t: torch.roll(t,
+ shifts=(-self.expand_size[0], -self.
+ expand_size[1]),
+ dims=(2, 3)), (k, v))
+ (k_tr, v_tr) = map(
+ lambda t: torch.roll(t,
+ shifts=(-self.expand_size[0], self.
+ expand_size[1]),
+ dims=(2, 3)), (k, v))
+ (k_bl, v_bl) = map(
+ lambda t: torch.roll(t,
+ shifts=(self.expand_size[0], -self.
+ expand_size[1]),
+ dims=(2, 3)), (k, v))
+ (k_br, v_br) = map(
+ lambda t: torch.roll(t,
+ shifts=(self.expand_size[0], self.
+ expand_size[1]),
+ dims=(2, 3)), (k, v))
+
+ (k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows) = map(
+ lambda t: window_partition(t, self.window_size).view(
+ -1, T, self.window_size[0] * self.window_size[1], self.
+ num_heads, C // self.num_heads), (k_tl, k_tr, k_bl, k_br))
+ (v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows) = map(
+ lambda t: window_partition(t, self.window_size).view(
+ -1, T, self.window_size[0] * self.window_size[1], self.
+ num_heads, C // self.num_heads), (v_tl, v_tr, v_bl, v_br))
+ k_rolled = torch.cat(
+ (k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows),
+ 2).permute(0, 3, 1, 2, 4).contiguous()
+ v_rolled = torch.cat(
+ (v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows),
+ 2).permute(0, 3, 1, 2, 4).contiguous()
+
+ # mask out tokens in current window
+ k_rolled = k_rolled[:, :, :, self.valid_ind_rolled]
+ v_rolled = v_rolled[:, :, :, self.valid_ind_rolled]
+ temp_N = k_rolled.shape[3]
+ k_rolled = k_rolled.view(-1, self.num_heads, T * temp_N,
+ C // self.num_heads)
+ v_rolled = v_rolled.view(-1, self.num_heads, T * temp_N,
+ C // self.num_heads)
+ k_rolled = torch.cat((k_windows, k_rolled), 2)
+ v_rolled = torch.cat((v_windows, v_rolled), 2)
+ else:
+ k_rolled = k_windows
+ v_rolled = v_windows
+
+ # q(k/v)_windows shape : [16, 4, 225, 128]
+ # k_rolled.shape : [16, 4, 5, 165, 128]
+ # ideal expanded window size 153 ((5+2*2)*(9+2*4))
+ # k_windows=45 expand_window=108 overlap_window=12 (since expand_size < window_size / 2)
+
+ if self.pool_method != "none" and self.focal_level > 1:
+ k_pooled = []
+ v_pooled = []
+ for k in range(self.focal_level - 1):
+ stride = 2**k
+ x_window_pooled = x_all[k + 1].permute(
+ 0, 3, 1, 2, 4).contiguous() # B, T, nWh, nWw, C
+
+ nWh, nWw = x_window_pooled.shape[2:4]
+
+ # generate mask for pooled windows
+ mask = x_window_pooled.new(T, nWh, nWw).fill_(1)
+ # unfold mask: [nWh*nWw//s//s, k*k, 1]
+ unfolded_mask = self.unfolds[k](mask.unsqueeze(1)).view(
+ 1, T, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(4, 1, 2, 3, 0).contiguous().\
+ view(nWh*nWw // stride // stride, -1, 1)
+
+ if k > 0:
+ valid_ind_unfold_k = getattr(
+ self, "valid_ind_unfold_{}".format(k))
+ unfolded_mask = unfolded_mask[:, valid_ind_unfold_k]
+
+ x_window_masks = unfolded_mask.flatten(1).unsqueeze(0)
+ x_window_masks = x_window_masks.masked_fill(
+ x_window_masks == 0,
+ float(-100.0)).masked_fill(x_window_masks > 0, float(0.0))
+ mask_all[k + 1] = x_window_masks
+
+ # generate k and v for pooled windows
+ qkv_pooled = self.qkv(x_window_pooled).reshape(
+ B, T, nWh, nWw, 3, C).permute(4, 0, 1, 5, 2,
+ 3).view(3, -1, C, nWh,
+ nWw).contiguous()
+ k_pooled_k, v_pooled_k = qkv_pooled[1], qkv_pooled[
+ 2] # B*T, C, nWh, nWw
+ # k_pooled_k shape: [5, 512, 4, 4]
+ # self.unfolds[k](k_pooled_k) shape: [5, 23040 (512 * 5 * 9 ), 16]
+
+ (k_pooled_k, v_pooled_k) = map(
+ lambda t: self.unfolds[k](t).view(
+ B, T, C, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(0, 5, 1, 3, 4, 2).contiguous().\
+ view(-1, T, self.unfolds[k].kernel_size[0]*self.unfolds[k].kernel_size[1], self.num_heads, C // self.num_heads).permute(0, 3, 1, 2, 4).contiguous(),
+ (k_pooled_k, v_pooled_k) # (B x (nH*nW)) x nHeads x T x (unfold_wsize x unfold_wsize) x head_dim
+ )
+ # k_pooled_k shape : [16, 4, 5, 45, 128]
+
+ # select valid unfolding index
+ if k > 0:
+ (k_pooled_k, v_pooled_k) = map(
+ lambda t: t[:, :, :, valid_ind_unfold_k],
+ (k_pooled_k, v_pooled_k))
+
+ k_pooled_k = k_pooled_k.view(
+ -1, self.num_heads, T * self.unfolds[k].kernel_size[0] *
+ self.unfolds[k].kernel_size[1], C // self.num_heads)
+ v_pooled_k = v_pooled_k.view(
+ -1, self.num_heads, T * self.unfolds[k].kernel_size[0] *
+ self.unfolds[k].kernel_size[1], C // self.num_heads)
+
+ k_pooled += [k_pooled_k]
+ v_pooled += [v_pooled_k]
+
+ # k_all (v_all) shape : [16, 4, 5 * 210, 128]
+ k_all = torch.cat([k_rolled] + k_pooled, 2)
+ v_all = torch.cat([v_rolled] + v_pooled, 2)
+ else:
+ k_all = k_rolled
+ v_all = v_rolled
+
+ N = k_all.shape[-2]
+ q_windows = q_windows * self.scale
+ attn = (
+ q_windows @ k_all.transpose(-2, -1)
+ ) # B*nW, nHead, T*window_size*window_size, T*focal_window_size*focal_window_size
+ # T * 45
+ window_area = T * self.window_size[0] * self.window_size[1]
+ # T * 165
+ window_area_rolled = k_rolled.shape[2]
+
+ if self.pool_method != "none" and self.focal_level > 1:
+ offset = window_area_rolled
+ for k in range(self.focal_level - 1):
+ # add attentional mask
+ # mask_all[1] shape [1, 16, T * 45]
+
+ bias = tuple((i + 2**k - 1) for i in self.focal_window)
+
+ if mask_all[k + 1] is not None:
+ attn[:, :, :window_area, offset:(offset + (T*bias[0]*bias[1]))] = \
+ attn[:, :, :window_area, offset:(offset + (T*bias[0]*bias[1]))] + \
+ mask_all[k+1][:, :, None, None, :].repeat(attn.shape[0] // mask_all[k+1].shape[1], 1, 1, 1, 1).view(-1, 1, 1, mask_all[k+1].shape[-1])
+
+ offset += T * bias[0] * bias[1]
+
+ if mask_all[0] is not None:
+ nW = mask_all[0].shape[0]
+ attn = attn.view(attn.shape[0] // nW, nW, self.num_heads,
+ window_area, N)
+ attn[:, :, :, :, :
+ window_area] = attn[:, :, :, :, :window_area] + mask_all[0][
+ None, :, None, :, :]
+ attn = attn.view(-1, self.num_heads, window_area, N)
+ attn = self.softmax(attn)
+ else:
+ attn = self.softmax(attn)
+
+ x = (attn @ v_all).transpose(1, 2).reshape(attn.shape[0], window_area,
+ C)
+ x = self.proj(x)
+ return x
+
+
+class TemporalFocalTransformerBlock(nn.Module):
+ r""" Temporal Focal Transformer Block.
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads.
+ window_size (tuple[int]): Window size.
+ shift_size (int): Shift size for SW-MSA.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        focal_level (int): Number of focal levels.
+        focal_window (tuple[int]): Window size of each focal window.
+        n_vecs (int): Required for F3N.
+        t2t_params (dict): T2T parameters for F3N.
+ """
+ def __init__(self,
+ dim,
+ num_heads,
+ window_size=(5, 9),
+ mlp_ratio=4.,
+ qkv_bias=True,
+ pool_method="fc",
+ focal_level=2,
+ focal_window=(5, 9),
+ norm_layer=nn.LayerNorm,
+ n_vecs=None,
+ t2t_params=None):
+ super().__init__()
+ self.dim = dim
+ self.num_heads = num_heads
+ self.window_size = window_size
+ self.expand_size = tuple(i // 2 for i in window_size) # TODO
+ self.mlp_ratio = mlp_ratio
+ self.pool_method = pool_method
+ self.focal_level = focal_level
+ self.focal_window = focal_window
+
+ self.window_size_glo = self.window_size
+
+ self.pool_layers = nn.ModuleList()
+ if self.pool_method != "none":
+ for k in range(self.focal_level - 1):
+ window_size_glo = tuple(
+ math.floor(i / (2**k)) for i in self.window_size_glo)
+ self.pool_layers.append(
+ nn.Linear(window_size_glo[0] * window_size_glo[1], 1))
+ self.pool_layers[-1].weight.data.fill_(
+ 1. / (window_size_glo[0] * window_size_glo[1]))
+ self.pool_layers[-1].bias.data.fill_(0)
+
+ self.norm1 = norm_layer(dim)
+
+ self.attn = WindowAttention(dim,
+ expand_size=self.expand_size,
+ window_size=self.window_size,
+ focal_window=focal_window,
+ focal_level=focal_level,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ pool_method=pool_method)
+
+ self.norm2 = norm_layer(dim)
+ self.mlp = FusionFeedForward(dim, n_vecs=n_vecs, t2t_params=t2t_params)
+
+ def forward(self, x):
+ B, T, H, W, C = x.shape
+
+ shortcut = x
+ x = self.norm1(x)
+
+ shifted_x = x
+
+ x_windows_all = [shifted_x]
+ x_window_masks_all = [None]
+
+ # partition windows tuple(i // 2 for i in window_size)
+ if self.focal_level > 1 and self.pool_method != "none":
+ # if we add coarser granularity and the pool method is not none
+ for k in range(self.focal_level - 1):
+ window_size_glo = tuple(
+ math.floor(i / (2**k)) for i in self.window_size_glo)
+ pooled_h = math.ceil(H / window_size_glo[0]) * (2**k)
+ pooled_w = math.ceil(W / window_size_glo[1]) * (2**k)
+ H_pool = pooled_h * window_size_glo[0]
+ W_pool = pooled_w * window_size_glo[1]
+
+ x_level_k = shifted_x
+ # trim or pad shifted_x depending on the required size
+ if H > H_pool:
+ trim_t = (H - H_pool) // 2
+ trim_b = H - H_pool - trim_t
+ x_level_k = x_level_k[:, :, trim_t:-trim_b]
+ elif H < H_pool:
+ pad_t = (H_pool - H) // 2
+ pad_b = H_pool - H - pad_t
+ x_level_k = F.pad(x_level_k, (0, 0, 0, 0, pad_t, pad_b))
+
+ if W > W_pool:
+ trim_l = (W - W_pool) // 2
+ trim_r = W - W_pool - trim_l
+ x_level_k = x_level_k[:, :, :, trim_l:-trim_r]
+ elif W < W_pool:
+ pad_l = (W_pool - W) // 2
+ pad_r = W_pool - W - pad_l
+ x_level_k = F.pad(x_level_k, (0, 0, pad_l, pad_r))
+
+ x_windows_noreshape = window_partition_noreshape(
+ x_level_k.contiguous(), window_size_glo
+ ) # B, nw, nw, T, window_size, window_size, C
+ nWh, nWw = x_windows_noreshape.shape[1:3]
+ x_windows_noreshape = x_windows_noreshape.view(
+ B, nWh, nWw, T, window_size_glo[0] * window_size_glo[1],
+ C).transpose(4, 5) # B, nWh, nWw, T, C, wsize**2
+ x_windows_pooled = self.pool_layers[k](
+ x_windows_noreshape).flatten(-2) # B, nWh, nWw, T, C
+
+ x_windows_all += [x_windows_pooled]
+ x_window_masks_all += [None]
+
+ attn_windows = self.attn(
+ x_windows_all,
+ mask_all=x_window_masks_all) # nW*B, T*window_size*window_size, C
+
+ # merge windows
+ attn_windows = attn_windows.view(-1, T, self.window_size[0],
+ self.window_size[1], C)
+ shifted_x = window_reverse(attn_windows, self.window_size, T, H,
+ W) # B T H' W' C
+
+ # FFN
+ x = shortcut + shifted_x
+ y = self.norm2(x)
+ x = x + self.mlp(y.view(B, T * H * W, C)).view(B, T, H, W, C)
+
+ return x
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/tfocal_transformer_hq.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/tfocal_transformer_hq.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a24dfa799533ff96bfb94b01ad8593f45bb590f
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/model/modules/tfocal_transformer_hq.py
@@ -0,0 +1,565 @@
+"""
+ This code is based on:
+ [1] FuseFormer: Fusing Fine-Grained Information in Transformers for Video Inpainting, ICCV 2021
+ https://github.com/ruiliu-ai/FuseFormer
+ [2] Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet, ICCV 2021
+ https://github.com/yitu-opensource/T2T-ViT
+ [3] Focal Self-attention for Local-Global Interactions in Vision Transformers, NeurIPS 2021
+ https://github.com/microsoft/Focal-Transformer
+"""
+
+import math
+from functools import reduce
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SoftSplit(nn.Module):
+ def __init__(self, channel, hidden, kernel_size, stride, padding,
+ t2t_param):
+ super(SoftSplit, self).__init__()
+ self.kernel_size = kernel_size
+ self.t2t = nn.Unfold(kernel_size=kernel_size,
+ stride=stride,
+ padding=padding)
+ c_in = reduce((lambda x, y: x * y), kernel_size) * channel
+ self.embedding = nn.Linear(c_in, hidden)
+
+ self.t2t_param = t2t_param
+
+ def forward(self, x, b, output_size):
+ f_h = int((output_size[0] + 2 * self.t2t_param['padding'][0] -
+ (self.t2t_param['kernel_size'][0] - 1) - 1) /
+ self.t2t_param['stride'][0] + 1)
+ f_w = int((output_size[1] + 2 * self.t2t_param['padding'][1] -
+ (self.t2t_param['kernel_size'][1] - 1) - 1) /
+ self.t2t_param['stride'][1] + 1)
+
+ feat = self.t2t(x)
+ feat = feat.permute(0, 2, 1)
+ # feat shape [b*t, num_vec, ks*ks*c]
+ feat = self.embedding(feat)
+ # feat shape after embedding [b, t*num_vec, hidden]
+ feat = feat.view(b, -1, f_h, f_w, feat.size(2))
+ return feat
+
+
+class SoftComp(nn.Module):
+ def __init__(self, channel, hidden, kernel_size, stride, padding):
+ super(SoftComp, self).__init__()
+ self.relu = nn.LeakyReLU(0.2, inplace=True)
+ c_out = reduce((lambda x, y: x * y), kernel_size) * channel
+ self.embedding = nn.Linear(hidden, c_out)
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.padding = padding
+ self.bias_conv = nn.Conv2d(channel,
+ channel,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ # TODO upsample conv
+ # self.bias_conv = nn.Conv2d()
+ # self.bias = nn.Parameter(torch.zeros((channel, h, w), dtype=torch.float32), requires_grad=True)
+
+ def forward(self, x, t, output_size):
+ b_, _, _, _, c_ = x.shape
+ x = x.view(b_, -1, c_)
+ feat = self.embedding(x)
+ b, _, c = feat.size()
+ feat = feat.view(b * t, -1, c).permute(0, 2, 1)
+ feat = F.fold(feat,
+ output_size=output_size,
+ kernel_size=self.kernel_size,
+ stride=self.stride,
+ padding=self.padding)
+ feat = self.bias_conv(feat)
+ return feat
+
+
+class FusionFeedForward(nn.Module):
+ def __init__(self, d_model, n_vecs=None, t2t_params=None):
+ super(FusionFeedForward, self).__init__()
+        # We set d_ff to 1960 by default
+ hd = 1960
+ self.conv1 = nn.Sequential(nn.Linear(d_model, hd))
+ self.conv2 = nn.Sequential(nn.GELU(), nn.Linear(hd, d_model))
+ assert t2t_params is not None and n_vecs is not None
+ self.t2t_params = t2t_params
+
+ def forward(self, x, output_size):
+ n_vecs = 1
+ for i, d in enumerate(self.t2t_params['kernel_size']):
+ n_vecs *= int((output_size[i] + 2 * self.t2t_params['padding'][i] -
+ (d - 1) - 1) / self.t2t_params['stride'][i] + 1)
+
+ x = self.conv1(x)
+ b, n, c = x.size()
+ normalizer = x.new_ones(b, n, 49).view(-1, n_vecs, 49).permute(0, 2, 1)
+ normalizer = F.fold(normalizer,
+ output_size=output_size,
+ kernel_size=self.t2t_params['kernel_size'],
+ padding=self.t2t_params['padding'],
+ stride=self.t2t_params['stride'])
+
+ x = F.fold(x.view(-1, n_vecs, c).permute(0, 2, 1),
+ output_size=output_size,
+ kernel_size=self.t2t_params['kernel_size'],
+ padding=self.t2t_params['padding'],
+ stride=self.t2t_params['stride'])
+
+ x = F.unfold(x / normalizer,
+ kernel_size=self.t2t_params['kernel_size'],
+ padding=self.t2t_params['padding'],
+ stride=self.t2t_params['stride']).permute(
+ 0, 2, 1).contiguous().view(b, n, c)
+ x = self.conv2(x)
+ return x
+
+
+def window_partition(x, window_size):
+ """
+ Args:
+ x: shape is (B, T, H, W, C)
+ window_size (tuple[int]): window size
+ Returns:
+ windows: (B*num_windows, T*window_size*window_size, C)
+ """
+ B, T, H, W, C = x.shape
+ x = x.view(B, T, H // window_size[0], window_size[0], W // window_size[1],
+ window_size[1], C)
+ windows = x.permute(0, 2, 4, 1, 3, 5, 6).contiguous().view(
+ -1, T * window_size[0] * window_size[1], C)
+ return windows
+
+
+def window_partition_noreshape(x, window_size):
+ """
+ Args:
+ x: shape is (B, T, H, W, C)
+ window_size (tuple[int]): window size
+ Returns:
+ windows: (B, num_windows_h, num_windows_w, T, window_size, window_size, C)
+ """
+ B, T, H, W, C = x.shape
+ x = x.view(B, T, H // window_size[0], window_size[0], W // window_size[1],
+ window_size[1], C)
+ windows = x.permute(0, 2, 4, 1, 3, 5, 6).contiguous()
+ return windows
+
+
+def window_reverse(windows, window_size, T, H, W):
+ """
+ Args:
+ windows: shape is (num_windows*B, T, window_size, window_size, C)
+ window_size (tuple[int]): Window size
+ T (int): Temporal length of video
+ H (int): Height of image
+ W (int): Width of image
+ Returns:
+ x: (B, T, H, W, C)
+ """
+ B = int(windows.shape[0] / (H * W / window_size[0] / window_size[1]))
+ x = windows.view(B, H // window_size[0], W // window_size[1], T,
+ window_size[0], window_size[1], -1)
+ x = x.permute(0, 3, 1, 4, 2, 5, 6).contiguous().view(B, T, H, W, -1)
+ return x
+
+
+class WindowAttention(nn.Module):
+ """Temporal focal window attention
+ """
+ def __init__(self, dim, expand_size, window_size, focal_window,
+ focal_level, num_heads, qkv_bias, pool_method):
+
+ super().__init__()
+ self.dim = dim
+ self.expand_size = expand_size
+ self.window_size = window_size # Wh, Ww
+ self.pool_method = pool_method
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim**-0.5
+ self.focal_level = focal_level
+ self.focal_window = focal_window
+
+ if any(i > 0 for i in self.expand_size) and focal_level > 0:
+ # get mask for rolled k and rolled v
+ mask_tl = torch.ones(self.window_size[0], self.window_size[1])
+ mask_tl[:-self.expand_size[0], :-self.expand_size[1]] = 0
+ mask_tr = torch.ones(self.window_size[0], self.window_size[1])
+ mask_tr[:-self.expand_size[0], self.expand_size[1]:] = 0
+ mask_bl = torch.ones(self.window_size[0], self.window_size[1])
+ mask_bl[self.expand_size[0]:, :-self.expand_size[1]] = 0
+ mask_br = torch.ones(self.window_size[0], self.window_size[1])
+ mask_br[self.expand_size[0]:, self.expand_size[1]:] = 0
+ mask_rolled = torch.stack((mask_tl, mask_tr, mask_bl, mask_br),
+ 0).flatten(0)
+ self.register_buffer("valid_ind_rolled",
+ mask_rolled.nonzero(as_tuple=False).view(-1))
+
+ if pool_method != "none" and focal_level > 1:
+ self.unfolds = nn.ModuleList()
+
+ # build relative position bias between local patch and pooled windows
+ for k in range(focal_level - 1):
+ stride = 2**k
+ kernel_size = tuple(2 * (i // 2) + 2**k + (2**k - 1)
+ for i in self.focal_window)
+ # define unfolding operations
+ self.unfolds += [
+ nn.Unfold(kernel_size=kernel_size,
+ stride=stride,
+ padding=tuple(i // 2 for i in kernel_size))
+ ]
+
+ # define unfolding index for focal_level > 0
+ if k > 0:
+ mask = torch.zeros(kernel_size)
+ mask[(2**k) - 1:, (2**k) - 1:] = 1
+ self.register_buffer(
+ "valid_ind_unfold_{}".format(k),
+ mask.flatten(0).nonzero(as_tuple=False).view(-1))
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.proj = nn.Linear(dim, dim)
+
+ self.softmax = nn.Softmax(dim=-1)
+
+ def forward(self, x_all, mask_all=None):
+ """
+ Args:
+ x: input features with shape of (B, T, Wh, Ww, C)
+ mask: (0/-inf) mask with shape of (num_windows, T*Wh*Ww, T*Wh*Ww) or None
+
+ output: (nW*B, Wh*Ww, C)
+ """
+ x = x_all[0]
+
+ B, T, nH, nW, C = x.shape
+ qkv = self.qkv(x).reshape(B, T, nH, nW, 3,
+ C).permute(4, 0, 1, 2, 3, 5).contiguous()
+ q, k, v = qkv[0], qkv[1], qkv[2] # B, T, nH, nW, C
+
+ # partition q map
+ (q_windows, k_windows, v_windows) = map(
+ lambda t: window_partition(t, self.window_size).view(
+ -1, T, self.window_size[0] * self.window_size[1], self.
+ num_heads, C // self.num_heads).permute(0, 3, 1, 2, 4).
+ contiguous().view(-1, self.num_heads, T * self.window_size[
+ 0] * self.window_size[1], C // self.num_heads), (q, k, v))
+ # q(k/v)_windows shape : [16, 4, 225, 128]
+
+ if any(i > 0 for i in self.expand_size) and self.focal_level > 0:
+ (k_tl, v_tl) = map(
+ lambda t: torch.roll(t,
+ shifts=(-self.expand_size[0], -self.
+ expand_size[1]),
+ dims=(2, 3)), (k, v))
+ (k_tr, v_tr) = map(
+ lambda t: torch.roll(t,
+ shifts=(-self.expand_size[0], self.
+ expand_size[1]),
+ dims=(2, 3)), (k, v))
+ (k_bl, v_bl) = map(
+ lambda t: torch.roll(t,
+ shifts=(self.expand_size[0], -self.
+ expand_size[1]),
+ dims=(2, 3)), (k, v))
+ (k_br, v_br) = map(
+ lambda t: torch.roll(t,
+ shifts=(self.expand_size[0], self.
+ expand_size[1]),
+ dims=(2, 3)), (k, v))
+
+ (k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows) = map(
+ lambda t: window_partition(t, self.window_size).view(
+ -1, T, self.window_size[0] * self.window_size[1], self.
+ num_heads, C // self.num_heads), (k_tl, k_tr, k_bl, k_br))
+ (v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows) = map(
+ lambda t: window_partition(t, self.window_size).view(
+ -1, T, self.window_size[0] * self.window_size[1], self.
+ num_heads, C // self.num_heads), (v_tl, v_tr, v_bl, v_br))
+ k_rolled = torch.cat(
+ (k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows),
+ 2).permute(0, 3, 1, 2, 4).contiguous()
+ v_rolled = torch.cat(
+ (v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows),
+ 2).permute(0, 3, 1, 2, 4).contiguous()
+
+ # mask out tokens in current window
+ k_rolled = k_rolled[:, :, :, self.valid_ind_rolled]
+ v_rolled = v_rolled[:, :, :, self.valid_ind_rolled]
+ temp_N = k_rolled.shape[3]
+ k_rolled = k_rolled.view(-1, self.num_heads, T * temp_N,
+ C // self.num_heads)
+ v_rolled = v_rolled.view(-1, self.num_heads, T * temp_N,
+ C // self.num_heads)
+ k_rolled = torch.cat((k_windows, k_rolled), 2)
+ v_rolled = torch.cat((v_windows, v_rolled), 2)
+ else:
+ k_rolled = k_windows
+ v_rolled = v_windows
+
+ # q(k/v)_windows shape : [16, 4, 225, 128]
+ # k_rolled.shape : [16, 4, 5, 165, 128]
+ # ideal expanded window size 153 ((5+2*2)*(9+2*4))
+ # k_windows=45 expand_window=108 overlap_window=12 (since expand_size < window_size / 2)
+
+ if self.pool_method != "none" and self.focal_level > 1:
+ k_pooled = []
+ v_pooled = []
+ for k in range(self.focal_level - 1):
+ stride = 2**k
+ # B, T, nWh, nWw, C
+ x_window_pooled = x_all[k + 1].permute(0, 3, 1, 2,
+ 4).contiguous()
+
+ nWh, nWw = x_window_pooled.shape[2:4]
+
+ # generate mask for pooled windows
+ mask = x_window_pooled.new(T, nWh, nWw).fill_(1)
+ # unfold mask: [nWh*nWw//s//s, k*k, 1]
+ unfolded_mask = self.unfolds[k](mask.unsqueeze(1)).view(
+ 1, T, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(4, 1, 2, 3, 0).contiguous().\
+ view(nWh*nWw // stride // stride, -1, 1)
+
+ if k > 0:
+ valid_ind_unfold_k = getattr(
+ self, "valid_ind_unfold_{}".format(k))
+ unfolded_mask = unfolded_mask[:, valid_ind_unfold_k]
+
+ x_window_masks = unfolded_mask.flatten(1).unsqueeze(0)
+ x_window_masks = x_window_masks.masked_fill(
+ x_window_masks == 0,
+ float(-100.0)).masked_fill(x_window_masks > 0, float(0.0))
+ mask_all[k + 1] = x_window_masks
+
+ # generate k and v for pooled windows
+ qkv_pooled = self.qkv(x_window_pooled).reshape(
+ B, T, nWh, nWw, 3, C).permute(4, 0, 1, 5, 2,
+ 3).view(3, -1, C, nWh,
+ nWw).contiguous()
+ # B*T, C, nWh, nWw
+ k_pooled_k, v_pooled_k = qkv_pooled[1], qkv_pooled[2]
+ # k_pooled_k shape: [5, 512, 4, 4]
+ # self.unfolds[k](k_pooled_k) shape: [5, 23040 (512 * 5 * 9 ), 16]
+
+ (k_pooled_k, v_pooled_k) = map(
+ lambda t: self.unfolds[k]
+ (t).view(B, T, C, self.unfolds[k].kernel_size[0], self.
+ unfolds[k].kernel_size[1], -1)
+ .permute(0, 5, 1, 3, 4, 2).contiguous().view(
+ -1, T, self.unfolds[k].kernel_size[0] * self.unfolds[
+ k].kernel_size[1], self.num_heads, C // self.
+ num_heads).permute(0, 3, 1, 2, 4).contiguous(),
+ # (B x (nH*nW)) x nHeads x T x (unfold_wsize x unfold_wsize) x head_dim
+ (k_pooled_k, v_pooled_k))
+ # k_pooled_k shape : [16, 4, 5, 45, 128]
+
+ # select valid unfolding index
+ if k > 0:
+ (k_pooled_k, v_pooled_k) = map(
+ lambda t: t[:, :, :, valid_ind_unfold_k],
+ (k_pooled_k, v_pooled_k))
+
+ k_pooled_k = k_pooled_k.view(
+ -1, self.num_heads, T * self.unfolds[k].kernel_size[0] *
+ self.unfolds[k].kernel_size[1], C // self.num_heads)
+ v_pooled_k = v_pooled_k.view(
+ -1, self.num_heads, T * self.unfolds[k].kernel_size[0] *
+ self.unfolds[k].kernel_size[1], C // self.num_heads)
+
+ k_pooled += [k_pooled_k]
+ v_pooled += [v_pooled_k]
+
+ # k_all (v_all) shape : [16, 4, 5 * 210, 128]
+ k_all = torch.cat([k_rolled] + k_pooled, 2)
+ v_all = torch.cat([v_rolled] + v_pooled, 2)
+ else:
+ k_all = k_rolled
+ v_all = v_rolled
+
+ N = k_all.shape[-2]
+ q_windows = q_windows * self.scale
+ # B*nW, nHead, T*window_size*window_size, T*focal_window_size*focal_window_size
+ attn = (q_windows @ k_all.transpose(-2, -1))
+ # T * 45
+ window_area = T * self.window_size[0] * self.window_size[1]
+ # T * 165
+ window_area_rolled = k_rolled.shape[2]
+
+ if self.pool_method != "none" and self.focal_level > 1:
+ offset = window_area_rolled
+ for k in range(self.focal_level - 1):
+ # add attentional mask
+ # mask_all[1] shape [1, 16, T * 45]
+
+ bias = tuple((i + 2**k - 1) for i in self.focal_window)
+
+ if mask_all[k + 1] is not None:
+ attn[:, :, :window_area, offset:(offset + (T*bias[0]*bias[1]))] = \
+ attn[:, :, :window_area, offset:(offset + (T*bias[0]*bias[1]))] + \
+ mask_all[k+1][:, :, None, None, :].repeat(
+ attn.shape[0] // mask_all[k+1].shape[1], 1, 1, 1, 1).view(-1, 1, 1, mask_all[k+1].shape[-1])
+
+ offset += T * bias[0] * bias[1]
+
+ if mask_all[0] is not None:
+ nW = mask_all[0].shape[0]
+ attn = attn.view(attn.shape[0] // nW, nW, self.num_heads,
+ window_area, N)
+ attn[:, :, :, :, :
+ window_area] = attn[:, :, :, :, :window_area] + mask_all[0][
+ None, :, None, :, :]
+ attn = attn.view(-1, self.num_heads, window_area, N)
+ attn = self.softmax(attn)
+ else:
+ attn = self.softmax(attn)
+
+ x = (attn @ v_all).transpose(1, 2).reshape(attn.shape[0], window_area,
+ C)
+ x = self.proj(x)
+ return x
+
+
+class TemporalFocalTransformerBlock(nn.Module):
+ r""" Temporal Focal Transformer Block.
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads.
+ window_size (tuple[int]): Window size.
+ shift_size (int): Shift size for SW-MSA.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        focal_level (int): Number of focal levels.
+        focal_window (tuple[int]): Window size of each focal window.
+        n_vecs (int): Required for F3N.
+        t2t_params (dict): T2T parameters for F3N.
+ """
+ def __init__(self,
+ dim,
+ num_heads,
+ window_size=(5, 9),
+ mlp_ratio=4.,
+ qkv_bias=True,
+ pool_method="fc",
+ focal_level=2,
+ focal_window=(5, 9),
+ norm_layer=nn.LayerNorm,
+ n_vecs=None,
+ t2t_params=None):
+ super().__init__()
+ self.dim = dim
+ self.num_heads = num_heads
+ self.window_size = window_size
+ self.expand_size = tuple(i // 2 for i in window_size) # TODO
+ self.mlp_ratio = mlp_ratio
+ self.pool_method = pool_method
+ self.focal_level = focal_level
+ self.focal_window = focal_window
+
+ self.window_size_glo = self.window_size
+
+ self.pool_layers = nn.ModuleList()
+ if self.pool_method != "none":
+ for k in range(self.focal_level - 1):
+ window_size_glo = tuple(
+ math.floor(i / (2**k)) for i in self.window_size_glo)
+ self.pool_layers.append(
+ nn.Linear(window_size_glo[0] * window_size_glo[1], 1))
+ self.pool_layers[-1].weight.data.fill_(
+ 1. / (window_size_glo[0] * window_size_glo[1]))
+ self.pool_layers[-1].bias.data.fill_(0)
+
+ self.norm1 = norm_layer(dim)
+
+ self.attn = WindowAttention(dim,
+ expand_size=self.expand_size,
+ window_size=self.window_size,
+ focal_window=focal_window,
+ focal_level=focal_level,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ pool_method=pool_method)
+
+ self.norm2 = norm_layer(dim)
+ self.mlp = FusionFeedForward(dim, n_vecs=n_vecs, t2t_params=t2t_params)
+
+ def forward(self, x):
+ output_size = x[1]
+ x = x[0]
+
+ B, T, H, W, C = x.shape
+
+ shortcut = x
+ x = self.norm1(x)
+
+ shifted_x = x
+
+ x_windows_all = [shifted_x]
+ x_window_masks_all = [None]
+
+ # partition windows tuple(i // 2 for i in window_size)
+ if self.focal_level > 1 and self.pool_method != "none":
+ # if we add coarser granularity and the pool method is not none
+ for k in range(self.focal_level - 1):
+ window_size_glo = tuple(
+ math.floor(i / (2**k)) for i in self.window_size_glo)
+ pooled_h = math.ceil(H / window_size_glo[0]) * (2**k)
+ pooled_w = math.ceil(W / window_size_glo[1]) * (2**k)
+ H_pool = pooled_h * window_size_glo[0]
+ W_pool = pooled_w * window_size_glo[1]
+
+ x_level_k = shifted_x
+ # trim or pad shifted_x depending on the required size
+ if H > H_pool:
+ trim_t = (H - H_pool) // 2
+ trim_b = H - H_pool - trim_t
+ x_level_k = x_level_k[:, :, trim_t:-trim_b]
+ elif H < H_pool:
+ pad_t = (H_pool - H) // 2
+ pad_b = H_pool - H - pad_t
+ x_level_k = F.pad(x_level_k, (0, 0, 0, 0, pad_t, pad_b))
+
+ if W > W_pool:
+ trim_l = (W - W_pool) // 2
+ trim_r = W - W_pool - trim_l
+ x_level_k = x_level_k[:, :, :, trim_l:-trim_r]
+ elif W < W_pool:
+ pad_l = (W_pool - W) // 2
+ pad_r = W_pool - W - pad_l
+ x_level_k = F.pad(x_level_k, (0, 0, pad_l, pad_r))
+
+ x_windows_noreshape = window_partition_noreshape(
+ x_level_k.contiguous(), window_size_glo
+ ) # B, nw, nw, T, window_size, window_size, C
+ nWh, nWw = x_windows_noreshape.shape[1:3]
+ x_windows_noreshape = x_windows_noreshape.view(
+ B, nWh, nWw, T, window_size_glo[0] * window_size_glo[1],
+ C).transpose(4, 5) # B, nWh, nWw, T, C, wsize**2
+ x_windows_pooled = self.pool_layers[k](
+ x_windows_noreshape).flatten(-2) # B, nWh, nWw, T, C
+
+ x_windows_all += [x_windows_pooled]
+ x_window_masks_all += [None]
+
+ # nW*B, T*window_size*window_size, C
+ attn_windows = self.attn(x_windows_all, mask_all=x_window_masks_all)
+
+ # merge windows
+ attn_windows = attn_windows.view(-1, T, self.window_size[0],
+ self.window_size[1], C)
+ shifted_x = window_reverse(attn_windows, self.window_size, T, H,
+ W) # B T H' W' C
+
+ # FFN
+ x = shortcut + shifted_x
+ y = self.norm2(x)
+ x = x + self.mlp(y.view(B, T * H * W, C), output_size).view(
+ B, T, H, W, C)
+
+ return x, output_size
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/release_model/README.md b/phantom/submodules/phantom-E2FGVI/E2FGVI/release_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b2ae3bcc2c4e717adca2d375352b88de88156a6
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/release_model/README.md
@@ -0,0 +1,11 @@
+Place the downloaded model here.
+
+:link: **Download Links:** [[Google Drive](https://drive.google.com/file/d/1tNJMTJ2gmWdIXJoHVi5-H504uImUiJW9/view?usp=sharing)] [[Baidu Disk](https://pan.baidu.com/s/1qXAErbilY_n_Fh9KB8UF7w?pwd=lsjw)]
+
+The directory structure will be arranged as:
+```
+release_model
+ |- E2FGVI-CVPR22.pth
+ |- i3d_rgb_imagenet.pt (for evaluating VFID metric)
+ |- README.md
+```
\ No newline at end of file
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/test.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..448f10c3d92843f66278b5cda867bdad400ca2d3
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/test.py
@@ -0,0 +1,224 @@
+# -*- coding: utf-8 -*-
+import cv2
+from PIL import Image
+import numpy as np
+import importlib
+import os
+import argparse
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+from matplotlib import animation
+import torch
+
+from core.utils import to_tensors
+
+parser = argparse.ArgumentParser(description="E2FGVI")
+parser.add_argument("-v", "--video", type=str, required=True)
+parser.add_argument("-c", "--ckpt", type=str, required=True)
+parser.add_argument("-m", "--mask", type=str, required=True)
+parser.add_argument("--model", type=str, choices=['e2fgvi', 'e2fgvi_hq'])
+parser.add_argument("--step", type=int, default=10)
+parser.add_argument("--num_ref", type=int, default=-1)
+parser.add_argument("--neighbor_stride", type=int, default=5)
+parser.add_argument("--savefps", type=int, default=24)
+
+# args for e2fgvi_hq (which can handle videos with arbitrary resolution)
+parser.add_argument("--set_size", action='store_true', default=False)
+parser.add_argument("--width", type=int)
+parser.add_argument("--height", type=int)
+
+args = parser.parse_args()
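+
+# Illustrative invocation (paths are placeholders; the checkpoint name comes
+# from release_model/README.md):
+#   python test.py --model e2fgvi -v <frames_dir_or_video.mp4> -m <mask_dir> \
+#       -c release_model/E2FGVI-CVPR22.pth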
+
+ref_length = args.step # ref_step
+num_ref = args.num_ref
+neighbor_stride = args.neighbor_stride
+default_fps = args.savefps
+
+
+# sample reference frames from the whole video
+def get_ref_index(f, neighbor_ids, length):
+ ref_index = []
+ if num_ref == -1:
+ for i in range(0, length, ref_length):
+ if i not in neighbor_ids:
+ ref_index.append(i)
+ else:
+ start_idx = max(0, f - ref_length * (num_ref // 2))
+ end_idx = min(length, f + ref_length * (num_ref // 2))
+ for i in range(start_idx, end_idx + 1, ref_length):
+ if i not in neighbor_ids:
+ if len(ref_index) > num_ref:
+ break
+ ref_index.append(i)
+ return ref_index
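+
+# Worked example (illustrative, assuming the defaults num_ref=-1 and --step=10):
+# for a 50-frame video with neighbor_ids=[0, 1, 2, 3, 4, 5], the sampled
+# reference frames are every 10th frame outside the neighborhood:
+#   get_ref_index(0, [0, 1, 2, 3, 4, 5], 50) -> [10, 20, 30, 40]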
+
+
+# read frame-wise masks
+def read_mask(mpath, size):
+ masks = []
+ mnames = os.listdir(mpath)
+ mnames.sort()
+ for mp in mnames:
+ m = Image.open(os.path.join(mpath, mp))
+ m = m.resize(size, Image.NEAREST)
+ m = np.array(m.convert('L'))
+ m = np.array(m > 0).astype(np.uint8)
+ m = cv2.dilate(m,
+ cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)),
+ iterations=4)
+ masks.append(Image.fromarray(m * 255))
+ return masks
+
+
+# read frames from video
+def read_frame_from_videos(args):
+ vname = args.video
+ frames = []
+ if args.use_mp4:
+ vidcap = cv2.VideoCapture(vname)
+ success, image = vidcap.read()
+ count = 0
+ while success:
+ image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+ frames.append(image)
+ success, image = vidcap.read()
+ count += 1
+ else:
+ lst = os.listdir(vname)
+ lst.sort()
+ fr_lst = [vname + '/' + name for name in lst]
+ for fr in fr_lst:
+ image = cv2.imread(fr)
+ image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+ frames.append(image)
+ return frames
+
+
+# resize frames
+def resize_frames(frames, size=None):
+ if size is not None:
+ frames = [f.resize(size) for f in frames]
+ else:
+ size = frames[0].size
+ return frames, size
+
+
+def main_worker():
+ # set up models
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ if args.model == "e2fgvi":
+ size = (432, 240)
+ elif args.set_size:
+ size = (args.width, args.height)
+ else:
+ size = None
+
+ net = importlib.import_module('model.' + args.model)
+ model = net.InpaintGenerator().to(device)
+ data = torch.load(args.ckpt, map_location=device)
+ model.load_state_dict(data)
+ print(f'Loading model from: {args.ckpt}')
+ model.eval()
+
+    # prepare dataset
+    args.use_mp4 = args.video.endswith('.mp4')
+ print(
+ f'Loading videos and masks from: {args.video} | INPUT MP4 format: {args.use_mp4}'
+ )
+ frames = read_frame_from_videos(args)
+ frames, size = resize_frames(frames, size)
+ h, w = size[1], size[0]
+ video_length = len(frames)
+ imgs = to_tensors()(frames).unsqueeze(0) * 2 - 1
+ frames = [np.array(f).astype(np.uint8) for f in frames]
+
+ masks = read_mask(args.mask, size)
+ binary_masks = [
+ np.expand_dims((np.array(m) != 0).astype(np.uint8), 2) for m in masks
+ ]
+ masks = to_tensors()(masks).unsqueeze(0)
+ imgs, masks = imgs.to(device), masks.to(device)
+ comp_frames = [None] * video_length
+
+ # completing holes by e2fgvi
+ print(f'Start test...')
+ for f in tqdm(range(0, video_length, neighbor_stride)):
+ neighbor_ids = [
+ i for i in range(max(0, f - neighbor_stride),
+ min(video_length, f + neighbor_stride + 1))
+ ]
+ ref_ids = get_ref_index(f, neighbor_ids, video_length)
+ selected_imgs = imgs[:1, neighbor_ids + ref_ids, :, :, :]
+ selected_masks = masks[:1, neighbor_ids + ref_ids, :, :, :]
+ with torch.no_grad():
+ masked_imgs = selected_imgs * (1 - selected_masks)
+ mod_size_h = 60
+ mod_size_w = 108
+ h_pad = (mod_size_h - h % mod_size_h) % mod_size_h
+ w_pad = (mod_size_w - w % mod_size_w) % mod_size_w
+ masked_imgs = torch.cat(
+ [masked_imgs, torch.flip(masked_imgs, [3])],
+ 3)[:, :, :, :h + h_pad, :]
+ masked_imgs = torch.cat(
+ [masked_imgs, torch.flip(masked_imgs, [4])],
+ 4)[:, :, :, :, :w + w_pad]
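+            # Mirror-pad H and W up to multiples of 60 and 108 by flipping the
+            # tensor (432x240 is exactly 4x these values); the padded region
+            # is cropped off again right below via pred_imgs[:, :, :h, :w].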
+ pred_imgs, _ = model(masked_imgs, len(neighbor_ids))
+ pred_imgs = pred_imgs[:, :, :h, :w]
+ pred_imgs = (pred_imgs + 1) / 2
+ pred_imgs = pred_imgs.cpu().permute(0, 2, 3, 1).numpy() * 255
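+            # Windows advance by neighbor_stride frames but cover
+            # 2*neighbor_stride+1 frames each, so most frames are predicted
+            # twice; overlapping predictions are blended 50/50 below.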
+ for i in range(len(neighbor_ids)):
+ idx = neighbor_ids[i]
+ img = np.array(pred_imgs[i]).astype(
+ np.uint8) * binary_masks[idx] + frames[idx] * (
+ 1 - binary_masks[idx])
+ if comp_frames[idx] is None:
+ comp_frames[idx] = img
+ else:
+ comp_frames[idx] = comp_frames[idx].astype(
+ np.float32) * 0.5 + img.astype(np.float32) * 0.5
+
+ # saving videos
+ print('Saving videos...')
+ save_dir_name = 'results'
+ ext_name = '_results.mp4'
+ save_base_name = args.video.split('/')[-1]
+ save_name = save_base_name.replace(
+ '.mp4', ext_name) if args.use_mp4 else save_base_name + ext_name
+ if not os.path.exists(save_dir_name):
+ os.makedirs(save_dir_name)
+ save_path = os.path.join(save_dir_name, save_name)
+ writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*"mp4v"),
+ default_fps, size)
+ for f in range(video_length):
+ comp = comp_frames[f].astype(np.uint8)
+        writer.write(cv2.cvtColor(comp, cv2.COLOR_RGB2BGR))  # comp frames are RGB; VideoWriter expects BGR
+ writer.release()
+ print(f'Finish test! The result video is saved in: {save_path}.')
+
+ # show results
+ print('Let us enjoy the result!')
+ fig = plt.figure('Let us enjoy the result')
+ ax1 = fig.add_subplot(1, 2, 1)
+ ax1.axis('off')
+ ax1.set_title('Original Video')
+ ax2 = fig.add_subplot(1, 2, 2)
+ ax2.axis('off')
+ ax2.set_title('Our Result')
+ imdata1 = ax1.imshow(frames[0])
+ imdata2 = ax2.imshow(comp_frames[0].astype(np.uint8))
+
+ def update(idx):
+ imdata1.set_data(frames[idx])
+ imdata2.set_data(comp_frames[idx].astype(np.uint8))
+
+ fig.tight_layout()
+ anim = animation.FuncAnimation(fig,
+ update,
+ frames=len(frames),
+ interval=50)
+ plt.show()
+
+
+if __name__ == '__main__':
+ main_worker()
diff --git a/phantom/submodules/phantom-E2FGVI/E2FGVI/train.py b/phantom/submodules/phantom-E2FGVI/E2FGVI/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1770db607ae1eb2af3f5a2ce3cd96fa629602d78
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/E2FGVI/train.py
@@ -0,0 +1,89 @@
+import os
+import json
+import argparse
+from shutil import copyfile
+
+import torch
+import torch.multiprocessing as mp
+
+from core.trainer import Trainer
+from core.dist import (
+ get_world_size,
+ get_local_rank,
+ get_global_rank,
+ get_master_ip,
+)
+
+parser = argparse.ArgumentParser(description='E2FGVI')
+parser.add_argument('-c',
+ '--config',
+ default='configs/train_e2fgvi.json',
+ type=str)
+parser.add_argument('-p', '--port', default='23455', type=str)
+args = parser.parse_args()
+
+
+def main_worker(rank, config):
+ if 'local_rank' not in config:
+ config['local_rank'] = config['global_rank'] = rank
+ if config['distributed']:
+ torch.cuda.set_device(int(config['local_rank']))
+ torch.distributed.init_process_group(backend='nccl',
+ init_method=config['init_method'],
+ world_size=config['world_size'],
+ rank=config['global_rank'],
+ group_name='mtorch')
+ print('using GPU {}-{} for training'.format(int(config['global_rank']),
+ int(config['local_rank'])))
+
+ config['save_dir'] = os.path.join(
+ config['save_dir'],
+ '{}_{}'.format(config['model']['net'],
+ os.path.basename(args.config).split('.')[0]))
+
+ config['save_metric_dir'] = os.path.join(
+ './scores',
+ '{}_{}'.format(config['model']['net'],
+ os.path.basename(args.config).split('.')[0]))
+
+ if torch.cuda.is_available():
+ config['device'] = torch.device("cuda:{}".format(config['local_rank']))
+ else:
+ config['device'] = 'cpu'
+
+ if (not config['distributed']) or config['global_rank'] == 0:
+ os.makedirs(config['save_dir'], exist_ok=True)
+ os.makedirs(config['save_metric_dir'], exist_ok=True)
+ config_path = os.path.join(config['save_dir'],
+ args.config.split('/')[-1])
+ if not os.path.isfile(config_path):
+ copyfile(args.config, config_path)
+ print('[**] create folder {}'.format(config['save_dir']))
+
+ trainer = Trainer(config)
+ trainer.train()
+
+
+if __name__ == "__main__":
+
+ torch.backends.cudnn.benchmark = True
+
+ mp.set_sharing_strategy('file_system')
+
+ # loading configs
+ config = json.load(open(args.config))
+
+ # setting distributed configurations
+ config['world_size'] = get_world_size()
+ config['init_method'] = f"tcp://{get_master_ip()}:{args.port}"
+ config['distributed'] = True if config['world_size'] > 1 else False
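+    # A multi-process launch (e.g. via OpenMPI) yields world_size > 1 and
+    # enables NCCL distributed training; a single-process run keeps
+    # world_size == 1 and trains on one device.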
+    print('world_size:', config['world_size'])
+ # setup distributed parallel training environments
+ if get_master_ip() == "127.0.0.1":
+ # manually launch distributed processes
+ mp.spawn(main_worker, nprocs=config['world_size'], args=(config, ))
+ else:
+ # multiple processes have been launched by openmpi
+ config['local_rank'] = get_local_rank()
+ config['global_rank'] = get_global_rank()
+ main_worker(-1, config)
diff --git a/phantom/submodules/phantom-E2FGVI/LICENSE b/phantom/submodules/phantom-E2FGVI/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..17bc97bff80068baf08757e6e2ffd03a2c1208d4
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/LICENSE
@@ -0,0 +1,163 @@
+## creative commons
+
+# Attribution-NonCommercial 4.0 International
+
+Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.
+
+### Using Creative Commons Public Licenses
+
+Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.
+
+* __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors).
+
+* __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees).
+
+## Creative Commons Attribution-NonCommercial 4.0 International Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
+
+### Section 1 – Definitions.
+
+a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
+
+b. __Adapter's License__ means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
+
+c. __Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
+
+d. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
+
+e. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
+
+f. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
+
+g. __Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
+
+h. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License.
+
+i. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
+
+j. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
+
+k. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
+
+l. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
+
+### Section 2 – Scope.
+
+a. ___License grant.___
+
+ 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
+
+ A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
+
+ B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
+
+ 2. __Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
+
+ 3. __Term.__ The term of this Public License is specified in Section 6(a).
+
+ 4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
+
+ 5. __Downstream recipients.__
+
+ A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
+
+ B. __No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
+
+ 6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
+
+b. ___Other rights.___
+
+ 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this Public License.
+
+ 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
+
+### Section 3 – License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the following conditions.
+
+a. ___Attribution.___
+
+ 1. If You Share the Licensed Material (including in modified form), You must:
+
+ A. retain the following if it is supplied by the Licensor with the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
+
+ B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
+
+ C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
+
+ 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
+
+ 4. If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License.
+
+### Section 4 – Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
+
+a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
+
+b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
+
+c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
+
+### Section 5 – Disclaimer of Warranties and Limitation of Liability.
+
+a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__
+
+b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__
+
+c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
+
+### Section 6 – Term and Termination.
+
+a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
+
+b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
+
+c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
+
+d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
+
+### Section 7 – Other Terms and Conditions.
+
+a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
+
+b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
+
+### Section 8 – Interpretation.
+
+a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
+
+b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
+
+c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
+
+d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
+
+> Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.
+>
+> Creative Commons may be contacted at creativecommons.org
+
+Copyright (c) 2022 MCG-NKU
diff --git a/phantom/submodules/phantom-E2FGVI/README.md b/phantom/submodules/phantom-E2FGVI/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..70ed3db1788508ea2887f78d48b629f85d5a4d8a
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/README.md
@@ -0,0 +1,297 @@
+# E2FGVI (CVPR 2022)
+[](https://paperswithcode.com/sota/video-inpainting-on-davis?p=towards-an-end-to-end-framework-for-flow)
+[](https://paperswithcode.com/sota/video-inpainting-on-youtube-vos?p=towards-an-end-to-end-framework-for-flow)
+
+
+
+
+English | [简体中文](README_zh-CN.md)
+
+This repository contains the official implementation of the following paper:
+> **Towards An End-to-End Framework for Flow-Guided Video Inpainting**
+> Zhen Li#, Cheng-Ze Lu#, Jianhua Qin, Chun-Le Guo*, Ming-Ming Cheng
+> IEEE/CVF Conference on Computer Vision and Pattern Recognition (**CVPR**), 2022
+
+[[Paper](https://arxiv.org/abs/2204.02663)]
+[[Demo Video (Youtube)](https://www.youtube.com/watch?v=N--qC3T2wc4)]
+[[演示视频 (B站)](https://www.bilibili.com/video/BV1Ta411n7eH?spm_id_from=333.999.0.0)]
+[[MindSpore Implementation](https://github.com/Dragoniss/minspore-phase2-E2FGVI)]
+[Project Page (TBD)]
+[Poster (TBD)]
+
+You can try our colab demo here: [Open In Colab](https://colab.research.google.com/drive/12rwY2gtG8jVWlNx9pjmmM8uGmh5ue18G?usp=sharing)
+
+## :star: News
+- *2022.05.15:* We release E2FGVI-HQ, which can handle videos with **arbitrary resolution**. Although the model is trained only on 432x240 videos, it generalizes well to much higher resolutions. It also performs **better** than our original model on both PSNR and SSIM metrics.
+:link: Download links: [[Google Drive](https://drive.google.com/file/d/10wGdKSUOie0XmCr8SQ2A2FeDe-mfn5w3/view?usp=sharing)] [[Baidu Disk](https://pan.baidu.com/s/1jfm1oFU1eIy-IRfuHP8YXw?pwd=ssb3)] :movie_camera: Demo video: [[Youtube](https://www.youtube.com/watch?v=N--qC3T2wc4)] [[B站](https://www.bilibili.com/video/BV1Ta411n7eH?spm_id_from=333.999.0.0)]
+
+- *2022.04.06:* Our code is publicly available.
+## Demo
+
+
+
+### More examples:
+
+- Coco
+- Tennis
+- Space
+- Motocross
+
+## Overview
+
+
+### :rocket: Highlights:
+- **SOTA performance**: The proposed E2FGVI achieves significant improvements on all quantitative metrics in comparison with SOTA methods.
+- **High efficiency**: Our method processes 432 × 240 videos at 0.12 seconds per frame on a Titan XP GPU, which is nearly 15× faster than previous flow-based methods. Besides, our method has the lowest FLOPs among all compared SOTA methods.
+
+## Work in Progress
+- [ ] Update website page
+- [ ] Hugging Face demo
+- [ ] Efficient inference
+
+## Dependencies and Installation
+
+1. Clone Repo
+
+ ```bash
+ git clone https://github.com/MCG-NKU/E2FGVI.git
+ ```
+
+2. Create Conda Environment and Install Dependencies
+
+ ```bash
+ conda env create -f environment.yml
+ conda activate e2fgvi
+ ```
+ - Python >= 3.7
+ - PyTorch >= 1.5
+ - CUDA >= 9.2
+ - [mmcv-full](https://github.com/open-mmlab/mmcv#installation) (following the pipeline to install)
+
+ If the `environment.yml` file does not work for you, please follow [this issue](https://github.com/MCG-NKU/E2FGVI/issues/3) to solve the problem.
+
+## Get Started
+### Prepare pretrained models
+Before performing the following steps, please download our pretrained model first.
+
+
+
+Then, unzip the file and place the models in the `release_model` directory.
+
+The directory structure will be arranged as:
+```
+release_model
+ |- E2FGVI-CVPR22.pth
+ |- E2FGVI-HQ-CVPR22.pth
+ |- i3d_rgb_imagenet.pt (for evaluating VFID metric)
+ |- README.md
+```
+
+### Quick test
+We provide two examples in the [`examples`](./examples) directory.
+
+Run the following command to enjoy them:
+```shell
+# The first example (using split video frames)
+python test.py --model e2fgvi (or e2fgvi_hq) --video examples/tennis --mask examples/tennis_mask --ckpt release_model/E2FGVI-CVPR22.pth (or release_model/E2FGVI-HQ-CVPR22.pth)
+# The second example (using mp4 format video)
+python test.py --model e2fgvi (or e2fgvi_hq) --video examples/schoolgirls.mp4 --mask examples/schoolgirls_mask --ckpt release_model/E2FGVI-CVPR22.pth (or release_model/E2FGVI-HQ-CVPR22.pth)
+```
+The inpainted video will be saved in the `results` directory.
+Please prepare your own **mp4 video** (or **split frames**) and **frame-wise masks** if you want to test more cases.
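+
+For reference, here is a minimal sketch of how frame-wise masks could be produced (the video name, mask directory, and rectangle are placeholders, not files shipped with this repo); `test.py` only expects one mask image per frame, readable in sorted filename order, with non-zero pixels marking the region to inpaint:
+
+```python
+import os
+import cv2
+import numpy as np
+
+video = "my_video.mp4"       # hypothetical input video
+mask_dir = "my_video_mask"   # one mask image per frame, sorted by name
+os.makedirs(mask_dir, exist_ok=True)
+
+cap = cv2.VideoCapture(video)
+idx = 0
+while True:
+    ok, frame = cap.read()
+    if not ok:
+        break
+    mask = np.zeros(frame.shape[:2], dtype=np.uint8)
+    mask[100:200, 150:300] = 255  # white = region to remove
+    cv2.imwrite(os.path.join(mask_dir, f"{idx:05d}.png"), mask)
+    idx += 1
+cap.release()
+```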
+
+*Note:* E2FGVI always rescales the input video to a fixed resolution (432x240), while E2FGVI-HQ does not change the resolution of the input video. If you want to customize the output resolution, please use the `--set_size` flag and set the values of `--width` and `--height`.
+
+Example:
+```shell
+# Use this command to output a 720p video
+python test.py --model e2fgvi_hq --video <video_path> --mask <mask_path> --ckpt release_model/E2FGVI-HQ-CVPR22.pth --set_size --width 1280 --height 720
+```
+
+
+### Prepare dataset for training and evaluation
+
+| Dataset | YouTube-VOS | DAVIS |
+| :--- | :--- | :--- |
+| Details | For training (3,471) and evaluation (508) | For evaluation (50 in 90) |
+
+Images: [Official Link] (Download train and test all frames)
+
+The training and test split files are provided in `datasets/`.
+
+For each dataset, you should place `JPEGImages` into `datasets/<dataset_name>`.
+
+Then, run `sh datasets/zip_dir.sh` (**Note**: please edit the folder path accordingly) to compress each video in `datasets/<dataset_name>/JPEGImages`.
+
+Unzip the downloaded mask files into `datasets`.
+
+The `datasets` directory structure will be arranged as: (**Note**: please check it carefully)
+```
+datasets
+   |- davis
+      |- JPEGImages
+         |- <video_name>.zip
+         |- <video_name>.zip
+      |- test_masks
+         |- <video_name>
+            |- 00000.png
+            |- 00001.png
+      |- train.json
+      |- test.json
+   |- youtube-vos
+      |- JPEGImages
+         |- <video_name>.zip
+         |- <video_name>.zip
+      |- test_masks
+         |- <video_name>
+            |- 00000.png
+            |- 00001.png
+      |- train.json
+      |- test.json
+   |- zip_file.sh
+```
+### Evaluation
+Run one of the following commands for evaluation:
+```shell
+ # For evaluating E2FGVI model
+ python evaluate.py --model e2fgvi --dataset <dataset_name> --data_root datasets/ --ckpt release_model/E2FGVI-CVPR22.pth
+ # For evaluating E2FGVI-HQ model
+ python evaluate.py --model e2fgvi_hq --dataset <dataset_name> --data_root datasets/ --ckpt release_model/E2FGVI-HQ-CVPR22.pth
+
+```
+Evaluating E2FGVI will reproduce the scores reported in the paper.
+The scores of E2FGVI-HQ can be found in [[Prepare pretrained models](https://github.com/MCG-NKU/E2FGVI#prepare-pretrained-models)].
+
+The scores will also be saved under the `results/` directory.
+
+Please add `--save_results` if you want to further [evaluate the temporal warping error](https://github.com/phoenix104104/fast_blind_video_consistency#evaluation).
+
+### Training
+Our training configures are provided in [`train_e2fgvi.json`](./configs/train_e2fgvi.json) (for E2FGVI) and [`train_e2fgvi_hq.json`](./configs/train_e2fgvi_hq.json) (for E2FGVI-HQ).
+
+Run one of the following commands for training:
+```shell
+ # For training E2FGVI
+ python train.py -c configs/train_e2fgvi.json
+ # For training E2FGVI-HQ
+ python train.py -c configs/train_e2fgvi_hq.json
+```
+You could run the same command if you want to resume your training.
+
+The training loss can be monitored by running:
+```shell
+tensorboard --logdir release_model
+```
+
+You could follow [this pipeline](https://github.com/MCG-NKU/E2FGVI#evaluation) to evaluate your model.
+## Results
+
+### Quantitative results
+
+## Citation
+
+ If you find our repo useful for your research, please consider citing our paper:
+
+ ```bibtex
+ @inproceedings{liCvpr22vInpainting,
+ title={Towards An End-to-End Framework for Flow-Guided Video Inpainting},
+ author={Li, Zhen and Lu, Cheng-Ze and Qin, Jianhua and Guo, Chun-Le and Cheng, Ming-Ming},
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year={2022}
+ }
+ ```
+## Contact
+
+If you have any questions, please feel free to contact us via `zhenli1031ATgmail.com` or `czlu919AToutlook.com`.
+
+## License
+This project is licensed under a [Creative Commons Attribution-NonCommercial 4.0 International](https://creativecommons.org/licenses/by-nc/4.0/) license for non-commercial use only.
+Any commercial use requires formal permission first.
+
+## Acknowledgement
+
+This repository is maintained by [Zhen Li](https://paper99.github.io) and [Cheng-Ze Lu](https://github.com/LGYoung).
+
+This code is based on [STTN](https://github.com/researchmm/STTN), [FuseFormer](https://github.com/ruiliu-ai/FuseFormer), [Focal-Transformer](https://github.com/microsoft/Focal-Transformer), and [MMEditing](https://github.com/open-mmlab/mmediting).
diff --git a/phantom/submodules/phantom-E2FGVI/README_zh-CN.md b/phantom/submodules/phantom-E2FGVI/README_zh-CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..a00726ff6e27dc34f555c7a236c5738686aedba5
--- /dev/null
+++ b/phantom/submodules/phantom-E2FGVI/README_zh-CN.md
@@ -0,0 +1,294 @@
+# E2FGVI (CVPR 2022) - Simplified Chinese
+[](https://paperswithcode.com/sota/video-inpainting-on-davis?p=towards-an-end-to-end-framework-for-flow)
+[](https://paperswithcode.com/sota/video-inpainting-on-youtube-vos?p=towards-an-end-to-end-framework-for-flow)
+
+
+
+
+[English](README.md) | Simplified Chinese
+
+This repository contains the official implementation of the following paper:
+> **Towards An End-to-End Framework for Flow-Guided Video Inpainting**
+> Zhen Li#, Cheng-Ze Lu#, Jianhua Qin, Chun-Le Guo*, Ming-Ming Cheng
+> IEEE/CVF Conference on Computer Vision and Pattern Recognition (**CVPR**), 2022
+
+[[Paper](https://arxiv.org/abs/2204.02663)]
+[[Demo Video (Youtube)](https://www.youtube.com/watch?v=N--qC3T2wc4)]
+[[演示视频 (B站)](https://www.bilibili.com/video/BV1Ta411n7eH?spm_id_from=333.999.0.0)]
+[Project Page (TBD)]
+[Poster (TBD)]
+
+Colab demo: [Open In Colab](https://colab.research.google.com/drive/12rwY2gtG8jVWlNx9pjmmM8uGmh5ue18G?usp=sharing)
+
+## :star: News
+- *2022.05.15:* E2FGVI-HQ, which can handle videos with **arbitrary resolution**, has been released. The model is trained only at 432x240 resolution, yet generalizes well to inference at much higher resolutions and achieves **better** PSNR/SSIM than the original model.
+:link: Download links: [[Google Drive](https://drive.google.com/file/d/10wGdKSUOie0XmCr8SQ2A2FeDe-mfn5w3/view?usp=sharing)] [[Baidu Disk](https://pan.baidu.com/s/1jfm1oFU1eIy-IRfuHP8YXw?pwd=ssb3)] :movie_camera: Demo video: [[Youtube](https://www.youtube.com/watch?v=N--qC3T2wc4)] [[Bilibili](https://www.bilibili.com/video/BV1Ta411n7eH?spm_id_from=333.999.0.0)]
+
+- *2022.04.06:* The code is publicly available.
+## Demo
+
+
+
+### More examples (click for details):
+
+
+
+## Requirements
+
+* Python >= 3.7.0
+* MMPose >= 0.23.0
+* MMDetection >= 2.21.0
+
+## Tutorials
+
+* [Get started with MMPose Webcam API (Chinese)](/tools/webcam/docs/get_started_cn.md)
+* [Build a Webcam App: A Step-by-step Instruction (Chinese)](/tools/webcam/docs/example_cn.md)
+
+## Examples
+
+* [Pose Estimation](/tools/webcam/configs/examples/): A simple example to estimate and visualize human/animal pose.
+* [Eye Effects](/tools/webcam/configs/eyes/): Apply sunglasses and bug-eye effects.
+* [Face Swap](/tools/webcam/configs/face_swap/): Everybody gets someone else's face.
+* [Meow Dwen Dwen](/tools/webcam/configs/meow_dwen_dwen/): Dress up your cat in Bing Dwen Dwen costume.
+* [Super Saiyan](/tools/webcam/configs/supersaiyan/): Super Saiyan transformation!
+* [New Year](/tools/webcam/configs/newyear/): Set off some firecrackers to celebrate Chinese New Year.
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/background/README.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/background/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7be8782e38717c6d537648e313921fb8c48b124e
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/background/README.md
@@ -0,0 +1,73 @@
+# Matting Effects
+
+We can apply background matting to the videos.
+
+## Instruction
+
+### Get started
+
+Launch the demo from the mmpose root directory:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/background/background.py
+```
+
+### Hotkeys
+
+| Hotkey | Function |
+| -- | -- |
+| b | Toggle the background matting effect on/off. |
+| h | Show help information. |
+| m | Show the monitoring information. |
+| q | Exit. |
+
+Note that the demo will automatically save the output video into a file `record.mp4`.
+
+### Configuration
+
+- **Choose a detection model**
+
+Users can choose detection models from the [MMDetection Model Zoo](https://mmdetection.readthedocs.io/en/v2.20.0/model_zoo.html). Just set the `model_config` and `model_checkpoint` in the detector node accordingly, and the model will be automatically downloaded and loaded.
+Note that in order to perform background matting, the model should be able to produce segmentation masks.
+
+```python
+# 'DetectorNode':
+# This node performs object detection from the frame image using an
+# MMDetection model.
+dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py',
+ model_checkpoint='https://download.openmmlab.com/'
+ 'mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/'
+ 'mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392'
+ '__segm_mAP-0.354_20200505_003907-3e542a40.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+```
+
+- **Run the demo without GPU**
+
+If you don't have a GPU and CUDA on your device, the demo can run on CPU only by setting `device='cpu'` in all model nodes. For example:
+
+```python
+dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py',
+ model_checkpoint='https://download.openmmlab.com/'
+ 'mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/'
+ 'mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392'
+ '__segm_mAP-0.354_20200505_003907-3e542a40.pth',
+ device='cpu',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+```
+
+- **Debug webcam and display**
+
+You can launch the webcam runner with a debug config:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/examples/test_camera.py
+```
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/background/background.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/background/background.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb9f4d616e929cbe7f3c789a729ce2c07d40b9a1
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/background/background.py
@@ -0,0 +1,93 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ # Basic configurations of the runner
+ name='Matting Effects',
+ camera_id=0,
+ camera_fps=10,
+ synchronous=False,
+ # Define nodes.
+ # The configuration of a node usually includes:
+ # 1. 'type': Node class name
+ # 2. 'name': Node name
+ # 3. I/O buffers (e.g. 'input_buffer', 'output_buffer'): specify the
+ # input and output buffer names. This may depend on the node class.
+ # 4. 'enable_key': assign a hot-key to toggle enable/disable this node.
+ # This may depend on the node class.
+ # 5. Other class-specific arguments
+ nodes=[
+ # 'DetectorNode':
+ # This node performs object detection from the frame image using an
+ # MMDetection model.
+ dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py',
+ model_checkpoint='https://download.openmmlab.com/'
+ 'mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/'
+ 'mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392'
+ '__segm_mAP-0.354_20200505_003907-3e542a40.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+ # 'TopDownPoseEstimatorNode':
+ # This node performs keypoint detection from the frame image using an
+        # MMPose top-down model. Detection results are needed.
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Human Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_mbv3_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://openmmlab-share.oss-cn-hangz'
+ 'hou.aliyuncs.com/mmpose/top_down/vipnas/vipnas_mbv3_co'
+ 'co_wholebody_256x192_dark-e2158108_20211205.pth',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='human_pose'),
+ # 'ModelResultBindingNode':
+ # This node binds the latest model inference result with the current
+ # frame. (This means the frame image and inference result may be
+ # asynchronous).
+ dict(
+ type='ModelResultBindingNode',
+ name='ResultBinder',
+ frame_buffer='_frame_', # `_frame_` is a runner-reserved buffer
+ result_buffer='human_pose',
+ output_buffer='frame'),
+        # 'BackgroundNode':
+        # This node draws the matting visualization result in the frame image.
+        # Mask results are needed.
+ dict(
+ type='BackgroundNode',
+ name='Visualizer',
+ enable_key='b',
+ enable=True,
+ frame_buffer='frame',
+ output_buffer='vis_bg',
+ cls_names=['person']),
+ # 'NoticeBoardNode':
+        # This node shows a notice board with given content, e.g. help
+ # information.
+ dict(
+ type='NoticeBoardNode',
+ name='Helper',
+ enable_key='h',
+ frame_buffer='vis_bg',
+ output_buffer='vis',
+ content_lines=[
+ 'This is a demo for background changing effects. Have fun!',
+ '', 'Hot-keys:', '"b": Change background',
+ '"h": Show help information',
+ '"m": Show diagnostic information', '"q": Exit'
+ ],
+ ),
+ # 'MonitorNode':
+        # This node shows diagnostic information in the frame image. It can
+ # be used for debugging or monitoring system resource status.
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ enable=False,
+ frame_buffer='vis',
+            output_buffer='_display_') # `_display_` is a runner-reserved buffer
+ ])
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/README.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ec9b961d284631478b3c326872d75942437a7f0e
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/README.md
@@ -0,0 +1,110 @@
+# Pose Estimation Demo
+
+This demo performs human bounding box and keypoint detection, and visualizes results.
+
+
+
+
+
+## Instruction
+
+### Get started
+
+Launch the demo from the mmpose root directory:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/examples/pose_estimation.py
+```
+
+### Hotkeys
+
+| Hotkey | Function |
+| -- | -- |
+| v | Toggle the pose visualization on/off. |
+| h | Show help information. |
+| m | Show the monitoring information. |
+| q | Exit. |
+
+Note that the demo will automatically save the output video into a file `record.mp4`.
+
+### Configuration
+
+- **Choose a detection model**
+
+Users can choose detection models from the [MMDetection Model Zoo](https://mmdetection.readthedocs.io/en/v2.20.0/model_zoo.html). Just set the `model_config` and `model_checkpoint` in the detector node accordingly, and the model will be automatically downloaded and loaded.
+
+```python
+# 'DetectorNode':
+# This node performs object detection from the frame image using an
+# MMDetection model.
+dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco.py',
+ model_checkpoint='https://download.openmmlab.com'
+ '/mmdetection/v2.0/ssd/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_'
+ 'scratch_600e_coco_20210629_110627-974d9307.pth',
+ input_buffer='_input_',
+ output_buffer='det_result')
+```
+
+- **Choose one or more pose models**
+
+In this demo we use two [top-down](https://github.com/open-mmlab/mmpose/tree/master/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap) pose estimation models for humans and animals respectively. Users can choose models from the [MMPose Model Zoo](https://mmpose.readthedocs.io/en/latest/modelzoo.html). To apply different pose models on different instance types, you can add multiple pose estimator nodes with `cls_names` set accordingly.
+
+```python
+# 'TopDownPoseEstimatorNode':
+# This node performs keypoint detection from the frame image using an
+# MMPose top-down model. Detection results are needed.
+dict(
+ type='TopDownPoseEstimatorNode',
+ name='Human Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_mbv3_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://openmmlab-share.oss-cn-hangz'
+ 'hou.aliyuncs.com/mmpose/top_down/vipnas/vipnas_mbv3_co'
+ 'co_wholebody_256x192_dark-e2158108_20211205.pth',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='human_pose'),
+dict(
+ type='TopDownPoseEstimatorNode',
+ name='Animal Pose Estimator',
+ model_config='configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap'
+ '/animalpose/hrnet_w32_animalpose_256x256.py',
+ model_checkpoint='https://download.openmmlab.com/mmpose/animal/'
+ 'hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth',
+ cls_names=['cat', 'dog', 'horse', 'sheep', 'cow'],
+ input_buffer='human_pose',
+ output_buffer='animal_pose')
+```
+
+- **Run the demo without GPU**
+
+If you don't have a GPU and CUDA on your device, the demo can run on CPU only by setting `device='cpu'` in all model nodes. For example:
+
+```python
+dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco.py',
+ model_checkpoint='https://download.openmmlab.com'
+ '/mmdetection/v2.0/ssd/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_'
+ 'scratch_600e_coco_20210629_110627-974d9307.pth',
+ device='cpu',
+ input_buffer='_input_',
+ output_buffer='det_result')
+```
+
+- **Debug webcam and display**
+
+You can launch the webcam runner with a debug config:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/examples/test_camera.py
+```
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/pose_estimation.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/pose_estimation.py
new file mode 100644
index 0000000000000000000000000000000000000000..471333a448530c5b99f9016729b269953099f466
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/pose_estimation.py
@@ -0,0 +1,115 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ # Basic configurations of the runner
+ name='Pose Estimation',
+ camera_id=0,
+ camera_fps=20,
+ synchronous=False,
+ # Define nodes.
+ # The configuration of a node usually includes:
+ # 1. 'type': Node class name
+ # 2. 'name': Node name
+ # 3. I/O buffers (e.g. 'input_buffer', 'output_buffer'): specify the
+ # input and output buffer names. This may depend on the node class.
+ # 4. 'enable_key': assign a hot-key to toggle enable/disable this node.
+ # This may depend on the node class.
+ # 5. Other class-specific arguments
+ nodes=[
+ # 'DetectorNode':
+ # This node performs object detection from the frame image using an
+ # MMDetection model.
+ dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco.py',
+ model_checkpoint='https://download.openmmlab.com'
+ '/mmdetection/v2.0/ssd/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_'
+ 'scratch_600e_coco_20210629_110627-974d9307.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+ # 'TopDownPoseEstimatorNode':
+ # This node performs keypoint detection from the frame image using an
+        # MMPose top-down model. Detection results are needed.
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Human Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_mbv3_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://download.openmmlab.com/mmpose/top_down/'
+ 'vipnas/vipnas_mbv3_coco_wholebody_256x192_dark'
+ '-e2158108_20211205.pth',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='human_pose'),
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Animal Pose Estimator',
+ model_config='configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap'
+ '/animalpose/hrnet_w32_animalpose_256x256.py',
+ model_checkpoint='https://download.openmmlab.com/mmpose/animal/'
+ 'hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth',
+ cls_names=['cat', 'dog', 'horse', 'sheep', 'cow'],
+ input_buffer='human_pose',
+ output_buffer='animal_pose'),
+ # 'ModelResultBindingNode':
+ # This node binds the latest model inference result with the current
+ # frame. (This means the frame image and inference result may be
+ # asynchronous).
+ dict(
+ type='ModelResultBindingNode',
+ name='ResultBinder',
+ frame_buffer='_frame_', # `_frame_` is a runner-reserved buffer
+ result_buffer='animal_pose',
+ output_buffer='frame'),
+ # 'PoseVisualizerNode':
+        # This node draws the pose visualization result in the frame image.
+        # Pose results are needed.
+ dict(
+ type='PoseVisualizerNode',
+ name='Visualizer',
+ enable_key='v',
+ frame_buffer='frame',
+ output_buffer='vis'),
+ # 'NoticeBoardNode':
+        # This node shows a notice board with given content, e.g. help
+ # information.
+ dict(
+ type='NoticeBoardNode',
+ name='Helper',
+ enable_key='h',
+ enable=True,
+ frame_buffer='vis',
+ output_buffer='vis_notice',
+ content_lines=[
+ 'This is a demo for pose visualization and simple image '
+ 'effects. Have fun!', '', 'Hot-keys:',
+ '"v": Pose estimation result visualization',
+ '"s": Sunglasses effect B-)', '"b": Bug-eye effect 0_0',
+ '"h": Show help information',
+ '"m": Show diagnostic information', '"q": Exit'
+ ],
+ ),
+ # 'MonitorNode':
+        # This node shows diagnostic information in the frame image. It can
+ # be used for debugging or monitoring system resource status.
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ enable=False,
+ frame_buffer='vis_notice',
+ output_buffer='display'),
+ # 'RecorderNode':
+        # This node saves the output video into a file.
+ dict(
+ type='RecorderNode',
+ name='Recorder',
+ out_video_file='record.mp4',
+ frame_buffer='display',
+ output_buffer='_display_'
+ # `_display_` is a runner-reserved buffer
+ )
+ ])
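+
+# Rough data flow of the node graph above (node -> output buffer -> next node):
+#   _input_ -> Detector -> det_result -> Human Pose Estimator -> human_pose
+#   -> Animal Pose Estimator -> animal_pose -> ResultBinder (+ _frame_)
+#   -> frame -> Visualizer -> vis -> Helper -> vis_notice -> Monitor
+#   -> display -> Recorder -> _display_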
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/test_camera.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/test_camera.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0c1677f4f1cbe8fe3dad081c7b9889602a39956
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/examples/test_camera.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ name='Debug CamRunner',
+ camera_id=0,
+ camera_fps=20,
+ nodes=[
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ frame_buffer='_frame_',
+ output_buffer='display'),
+ dict(
+ type='RecorderNode',
+ name='Recorder',
+ out_video_file='webcam_output.mp4',
+ frame_buffer='display',
+ output_buffer='_display_')
+ ])
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/eyes/README.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/eyes/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f9c37695eecb18a0e4becdbcc1aa59bde4e75247
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/eyes/README.md
@@ -0,0 +1,31 @@
+# Sunglasses and Bug-eye Effects
+
+We can apply fun effects on videos with pose estimation results, like adding sunglasses on the face or making the eyes look bigger.
+
+
+
+
+
+## Instruction
+
+### Get started
+
+Launch the demo from the mmpose root directory:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/eyes/eyes.py
+```
+
+### Hotkeys
+
+| Hotkey | Function |
+| -- | -- |
+| s | Toggle the sunglasses effect on/off. |
+| b | Toggle the bug-eye effect on/off. |
+| h | Show help information. |
+| m | Show the monitoring information. |
+| q | Exit. |
+
+### Configuration
+
+See the [README](/tools/webcam/configs/examples/README.md#configuration) of pose estimation demo for model configurations.
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/eyes/eyes.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/eyes/eyes.py
new file mode 100644
index 0000000000000000000000000000000000000000..91bbfba9d9f89f7c7071375bedcc73a1e18d1783
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/eyes/eyes.py
@@ -0,0 +1,114 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ # Basic configurations of the runner
+ name='Eye Effects',
+ camera_id=0,
+ camera_fps=20,
+ synchronous=False,
+ # Define nodes.
+ # The configuration of a node usually includes:
+ # 1. 'type': Node class name
+ # 2. 'name': Node name
+ # 3. I/O buffers (e.g. 'input_buffer', 'output_buffer'): specify the
+ # input and output buffer names. This may depend on the node class.
+ # 4. 'enable_key': assign a hot-key to toggle enable/disable this node.
+ # This may depend on the node class.
+ # 5. Other class-specific arguments
+ nodes=[
+ # 'DetectorNode':
+ # This node performs object detection from the frame image using an
+ # MMDetection model.
+ dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco.py',
+ model_checkpoint='https://download.openmmlab.com'
+ '/mmdetection/v2.0/ssd/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_'
+ 'scratch_600e_coco_20210629_110627-974d9307.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+ # 'TopDownPoseEstimatorNode':
+ # This node performs keypoint detection from the frame image using an
+        # MMPose top-down model. Detection results are needed.
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Human Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_mbv3_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://openmmlab-share.oss-cn-hangz'
+ 'hou.aliyuncs.com/mmpose/top_down/vipnas/vipnas_mbv3_co'
+ 'co_wholebody_256x192_dark-e2158108_20211205.pth',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='human_pose'),
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Animal Pose Estimator',
+ model_config='configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap'
+ '/animalpose/hrnet_w32_animalpose_256x256.py',
+ model_checkpoint='https://download.openmmlab.com/mmpose/animal/'
+ 'hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth',
+ cls_names=['cat', 'dog', 'horse', 'sheep', 'cow'],
+ input_buffer='human_pose',
+ output_buffer='animal_pose'),
+ # 'ModelResultBindingNode':
+ # This node binds the latest model inference result with the current
+ # frame. (This means the frame image and inference result may be
+ # asynchronous).
+ dict(
+ type='ModelResultBindingNode',
+ name='ResultBinder',
+ frame_buffer='_frame_', # `_frame_` is a runner-reserved buffer
+ result_buffer='animal_pose',
+ output_buffer='frame'),
+ # 'SunglassesNode':
+        # This node draws the sunglasses effect in the frame image.
+        # Pose results are needed.
+ dict(
+ type='SunglassesNode',
+ name='Visualizer',
+ enable_key='s',
+ enable=True,
+ frame_buffer='frame',
+ output_buffer='vis_sunglasses'),
+ # 'BugEyeNode':
+        # This node draws the bug-eye effect in the frame image.
+        # Pose results are needed.
+ dict(
+ type='BugEyeNode',
+ name='Visualizer',
+ enable_key='b',
+ enable=False,
+ frame_buffer='vis_sunglasses',
+ output_buffer='vis_bugeye'),
+ # 'NoticeBoardNode':
+        # This node shows a notice board with given content, e.g. help
+ # information.
+ dict(
+ type='NoticeBoardNode',
+ name='Helper',
+ enable_key='h',
+ frame_buffer='vis_bugeye',
+ output_buffer='vis',
+ content_lines=[
+ 'This is a demo for pose visualization and simple image '
+ 'effects. Have fun!', '', 'Hot-keys:',
+ '"s": Sunglasses effect B-)', '"b": Bug-eye effect 0_0',
+ '"h": Show help information',
+ '"m": Show diagnostic information', '"q": Exit'
+ ],
+ ),
+ # 'MonitorNode':
+        # This node shows diagnostic information in the frame image. It can
+ # be used for debugging or monitoring system resource status.
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ enable=False,
+ frame_buffer='vis',
+            output_buffer='_display_') # `_display_` is a runner-reserved buffer
+ ])
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/face_swap/README.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/face_swap/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..02f4c8aa855702bf6a668970f8e7e071611caf8e
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/face_swap/README.md
@@ -0,0 +1,31 @@
+# Sunglasses and Bug-eye Effects
+
+Look! Where is my face?:eyes: And whose face is it?:laughing:
+
+
+
+
+
+## Instruction
+
+### Get started
+
+Launch the demo from the mmpose root directory:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/face_swap/face_swap.py
+```
+
+### Hotkeys
+
+| Hotkey | Function |
+| -- | -- |
+| s | Switch between modes:<br>Shuffle: Randomly shuffle all faces<br>Clone: Choose one face and clone it for everyone<br>None: Nothing happens and everyone is safe :) |
+| v | Toggle the pose visualization on/off. |
+| h | Show help information. |
+| m | Show diagnostic information. |
+| q | Exit. |
+
+### Configuration
+
+See the [README](/tools/webcam/configs/examples/README.md#configuration) of the pose estimation demo for model configurations.
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/face_swap/face_swap.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/face_swap/face_swap.py
new file mode 100644
index 0000000000000000000000000000000000000000..403eaae4ace483d72a4baedbaf61072c24e3a1ec
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/face_swap/face_swap.py
@@ -0,0 +1,79 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ name='FaceSwap',
+ camera_id=0,
+ camera_fps=20,
+ synchronous=False,
+ nodes=[
+ dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco.py',
+ model_checkpoint='https://download.openmmlab.com'
+ '/mmdetection/v2.0/ssd/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_'
+ 'scratch_600e_coco_20210629_110627-974d9307.pth',
+ device='cpu',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='TopDown Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_res50_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://openmmlab-share.oss-cn-hangzhou'
+ '.aliyuncs.com/mmpose/top_down/vipnas/'
+ 'vipnas_res50_wholebody_256x192_dark-67c0ce35_20211112.pth',
+ device='cpu',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='pose_result'),
+ dict(
+ type='ModelResultBindingNode',
+ name='ResultBinder',
+ frame_buffer='_frame_', # `_frame_` is a runner-reserved buffer
+ result_buffer='pose_result',
+ output_buffer='frame'),
+ dict(
+ type='FaceSwapNode',
+ name='FaceSwapper',
+ mode_key='s',
+ frame_buffer='frame',
+ output_buffer='face_swap'),
+ dict(
+ type='PoseVisualizerNode',
+ name='Visualizer',
+ enable_key='v',
+ frame_buffer='face_swap',
+ output_buffer='vis_pose'),
+ dict(
+ type='NoticeBoardNode',
+ name='Help Information',
+ enable_key='h',
+ content_lines=[
+ 'Swap your faces! ',
+ 'Hot-keys:',
+ '"v": Toggle the pose visualization on/off.',
+ '"s": Switch between modes: Shuffle, Clone and None',
+ '"h": Show help information',
+ '"m": Show diagnostic information',
+ '"q": Exit',
+ ],
+ frame_buffer='vis_pose',
+ output_buffer='vis_notice'),
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ enable=False,
+ frame_buffer='vis_notice',
+ output_buffer='display'),
+ dict(
+ type='RecorderNode',
+ name='Recorder',
+ out_video_file='faceswap_output.mp4',
+ frame_buffer='display',
+ output_buffer='_display_')
+ ])
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/README.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..997ffc174bd70c2de6a22edee53f5b52275ae187
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/README.md
@@ -0,0 +1,44 @@
+# Meow Dwen Dwen
+
+Do you know [Bing DwenDwen (冰墩墩)](https://en.wikipedia.org/wiki/Bing_Dwen_Dwen_and_Shuey_Rhon_Rhon), the mascot of the 2022 Beijing Winter Olympics?
+
+
+
+
+
+Now you can dress your cat up in this costume and TA-DA! Get ready for the super cute **Meow Dwen Dwen**.
+
+
+
+
+
+Are you a dog fan? Hold on, here comes **Woof Dwen Dwen**.
+
+
+
+
+
+## Instruction
+
+### Get started
+
+Launch the demo from the mmpose root directory:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/meow_dwen_dwen/meow_dwen_dwen.py
+```
+
+### Hotkeys
+
+| Hotkey | Function |
+| -- | -- |
+| s | Change the background. |
+| h | Show help information. |
+| m | Show diagnostic information. |
+| q | Exit. |
+
+### Configuration
+
+- **Use video input**
+
+As you can see in the config, we set `camera_id` to the path (URL) of the input image. You can also set it to a video file path (or URL), or to a webcam ID number (e.g. `camera_id=0`), to capture a dynamic face from video input, as sketched below.
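+
+For reference, a minimal sketch of the relevant runner settings. The image URL is the cat picture used in `meow_dwen_dwen.py`; the video path is a hypothetical example:
+
+```python
+# Image input: `camera_id` points at an image URL or a local path
+runner = dict(
+    camera_id='https://user-images.githubusercontent.com/'
+    '15977946/152932036-b5554cf8-24cf-40d6-a358-35a106013f11.jpeg',
+    camera_fps=20,
+    nodes=[])  # node list omitted; see meow_dwen_dwen.py
+
+# Video input (hypothetical path): camera_id='path/to/your_cat_video.mp4'
+# Webcam input: camera_id=0
+```
+
+You can also override the value from the command line, e.g. `--cfg-options runner.camera_id=0` (see `tools/webcam/run_webcam.py`).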
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/meow_dwen_dwen.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/meow_dwen_dwen.py
new file mode 100644
index 0000000000000000000000000000000000000000..399d01cf7c8df103772913294f1c0612979330e6
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/meow_dwen_dwen.py
@@ -0,0 +1,92 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ # Basic configurations of the runner
+ name='Little fans of 2022 Beijing Winter Olympics',
+ # Cat image
+ camera_id='https://user-images.githubusercontent.com/'
+ '15977946/152932036-b5554cf8-24cf-40d6-a358-35a106013f11.jpeg',
+ # Dog image
+ # camera_id='https://user-images.githubusercontent.com/'
+ # '15977946/152932051-cd280b35-8066-45a0-8f52-657c8631aaba.jpg',
+ camera_fps=20,
+ nodes=[
+ dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco.py',
+ model_checkpoint='https://download.openmmlab.com'
+ '/mmdetection/v2.0/ssd/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_'
+ 'scratch_600e_coco_20210629_110627-974d9307.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Animal Pose Estimator',
+ model_config='configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap'
+ '/ap10k/hrnet_w32_ap10k_256x256.py',
+ model_checkpoint='https://download.openmmlab.com/mmpose/animal/'
+ 'hrnet/hrnet_w32_ap10k_256x256-18aac840_20211029.pth',
+ cls_names=['cat', 'dog'],
+ input_buffer='det_result',
+ output_buffer='animal_pose'),
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='TopDown Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_res50_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://openmmlab-share.oss-cn-hangzhou'
+ '.aliyuncs.com/mmpose/top_down/vipnas/'
+ 'vipnas_res50_wholebody_256x192_dark-67c0ce35_20211112.pth',
+ device='cpu',
+ cls_names=['person'],
+ input_buffer='animal_pose',
+ output_buffer='human_pose'),
+ dict(
+ type='ModelResultBindingNode',
+ name='ResultBinder',
+ frame_buffer='_frame_', # `_frame_` is a runner-reserved buffer
+ result_buffer='human_pose',
+ output_buffer='frame'),
+ dict(
+ type='XDwenDwenNode',
+ name='XDwenDwen',
+ mode_key='s',
+ resource_file='tools/webcam/configs/meow_dwen_dwen/'
+ 'resource-info.json',
+ out_shape=(480, 480),
+ frame_buffer='frame',
+ output_buffer='vis'),
+ dict(
+ type='NoticeBoardNode',
+ name='Helper',
+ enable_key='h',
+ enable=False,
+ frame_buffer='vis',
+ output_buffer='vis_notice',
+ content_lines=[
+ 'Let your pet put on a costume of Bing-Dwen-Dwen, '
+ 'the mascot of 2022 Beijing Winter Olympics. Have fun!', '',
+ 'Hot-keys:', '"s": Change the background',
+ '"h": Show help information',
+ '"m": Show diagnostic information', '"q": Exit'
+ ],
+ ),
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ enable=False,
+ frame_buffer='vis_notice',
+ output_buffer='display'),
+ dict(
+ type='RecorderNode',
+ name='Recorder',
+ out_video_file='record.mp4',
+ frame_buffer='display',
+ output_buffer='_display_'
+ # `_display_` is a runner-reserved buffer
+ )
+ ])
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/resource-info.json b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/resource-info.json
new file mode 100644
index 0000000000000000000000000000000000000000..adb811cc7f3eafea56ff4d3f577ec28e33e80f0a
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/meow_dwen_dwen/resource-info.json
@@ -0,0 +1,26 @@
+[
+ {
+ "id": 1,
+ "result": "{\"width\":690,\"height\":713,\"valid\":true,\"rotate\":0,\"step_1\":{\"toolName\":\"pointTool\",\"result\":[{\"x\":374.86387434554973,\"y\":262.8020942408377,\"attribute\":\"\",\"valid\":true,\"id\":\"8SK9cVyu\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":2},{\"x\":492.8261780104712,\"y\":285.2,\"attribute\":\"\",\"valid\":true,\"id\":\"qDk54WsI\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":1},{\"x\":430.11204188481673,\"y\":318.0502617801047,\"attribute\":\"\",\"valid\":true,\"id\":\"4H80L7lL\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":3}]},\"step_2\":{\"dataSourceStep\":0,\"toolName\":\"polygonTool\",\"result\":[{\"id\":\"pwUsrf9u\",\"sourceID\":\"\",\"valid\":true,\"textAttribute\":\"\",\"pointList\":[{\"x\":423.3926701570681,\"y\":191.87539267015708},{\"x\":488.3465968586388,\"y\":209.04712041884818},{\"x\":535.3821989528797,\"y\":248.6167539267016},{\"x\":549.5675392670157,\"y\":306.8513089005236},{\"x\":537.6219895287959,\"y\":349.407329842932},{\"x\":510.74450261780106,\"y\":381.51099476439794},{\"x\":480.1340314136126,\"y\":394.9497382198953},{\"x\":411.4471204188482,\"y\":390.47015706806286},{\"x\":355.45235602094243,\"y\":373.29842931937173},{\"x\":306.17696335078534,\"y\":327.00942408376966},{\"x\":294.97801047120424,\"y\":284.45340314136126},{\"x\":306.9235602094241,\"y\":245.6303664921466},{\"x\":333.8010471204189,\"y\":217.25968586387435},{\"x\":370.3842931937173,\"y\":196.35497382198955}],\"attribute\":\"\",\"order\":1}]}}",
+ "url": "https://user-images.githubusercontent.com/15977946/152742677-35fe8a01-bd06-4a12-a02e-949e7d71f28a.jpg",
+ "fileName": "bing_dwen_dwen1.jpg"
+ },
+ {
+ "id": 2,
+ "result": "{\"width\":690,\"height\":659,\"valid\":true,\"rotate\":0,\"step_1\":{\"dataSourceStep\":0,\"toolName\":\"pointTool\",\"result\":[{\"x\":293.2460732984293,\"y\":242.89842931937173,\"attribute\":\"\",\"valid\":true,\"id\":\"KgPs39bY\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":1},{\"x\":170.41675392670155,\"y\":270.50052356020944,\"attribute\":\"\",\"valid\":true,\"id\":\"XwHyoBFU\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":2},{\"x\":224.24083769633506,\"y\":308.45340314136126,\"attribute\":\"\",\"valid\":true,\"id\":\"Qfs4YfuB\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":3}]},\"step_2\":{\"dataSourceStep\":0,\"toolName\":\"polygonTool\",\"result\":[{\"id\":\"ts5jlJxb\",\"sourceID\":\"\",\"valid\":true,\"textAttribute\":\"\",\"pointList\":[{\"x\":178.69738219895285,\"y\":184.93403141361256},{\"x\":204.91937172774865,\"y\":172.5130890052356},{\"x\":252.5329842931937,\"y\":169.0628272251309},{\"x\":295.3162303664921,\"y\":175.27329842931937},{\"x\":333.95916230366487,\"y\":195.2848167539267},{\"x\":360.18115183246067,\"y\":220.1267015706806},{\"x\":376.0523560209424,\"y\":262.909947643979},{\"x\":373.98219895287957,\"y\":296.0324607329843},{\"x\":344.99999999999994,\"y\":335.365445026178},{\"x\":322.22827225130885,\"y\":355.37696335078533},{\"x\":272.544502617801,\"y\":378.1486910994764},{\"x\":221.48062827225127,\"y\":386.42931937172773},{\"x\":187.6680628272251,\"y\":385.7392670157068},{\"x\":158.68586387434553,\"y\":369.1780104712042},{\"x\":137.98429319371724,\"y\":337.43560209424083},{\"x\":127.63350785340312,\"y\":295.34240837696336},{\"x\":131.0837696335078,\"y\":242.89842931937173},{\"x\":147.64502617801045,\"y\":208.3958115183246}],\"attribute\":\"\",\"order\":1}]}}",
+ "url": "https://user-images.githubusercontent.com/15977946/152742707-c0c51844-e1d0-42d0-9a12-e369002e082f.jpg",
+ "fileName": "bing_dwen_dwen2.jpg"
+ },
+ {
+ "id": 3,
+ "result": "{\"width\":690,\"height\":811,\"valid\":true,\"rotate\":0,\"step_1\":{\"dataSourceStep\":0,\"toolName\":\"pointTool\",\"result\":[{\"x\":361.13507853403144,\"y\":300.62198952879584,\"attribute\":\"\",\"valid\":true,\"id\":\"uAtbXtf2\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":1},{\"x\":242.24502617801048,\"y\":317.60628272251313,\"attribute\":\"\",\"valid\":true,\"id\":\"iLtceHMA\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":2},{\"x\":302.5392670157068,\"y\":356.67015706806285,\"attribute\":\"\",\"valid\":true,\"id\":\"n9MTlJ6A\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":3}]},\"step_2\":{\"dataSourceStep\":0,\"toolName\":\"polygonTool\",\"result\":[{\"id\":\"5sTLU5wF\",\"sourceID\":\"\",\"valid\":true,\"textAttribute\":\"\",\"pointList\":[{\"x\":227.80837696335078,\"y\":247.12146596858642},{\"x\":248.18952879581153,\"y\":235.23246073298432},{\"x\":291.4994764397906,\"y\":225.04188481675394},{\"x\":351.7937172774869,\"y\":229.28795811518327},{\"x\":393.40523560209425,\"y\":245.42303664921468},{\"x\":424.8261780104712,\"y\":272.59790575916236},{\"x\":443.5089005235602,\"y\":298.07434554973827},{\"x\":436.7151832460733,\"y\":345.6303664921466},{\"x\":406.1434554973822,\"y\":382.9958115183247},{\"x\":355.1905759162304,\"y\":408.4722513089006},{\"x\":313.57905759162304,\"y\":419.5120418848168},{\"x\":262.6261780104712,\"y\":417.81361256544506},{\"x\":224.41151832460733,\"y\":399.9801047120419},{\"x\":201.48272251308902,\"y\":364.3130890052356},{\"x\":194.68900523560208,\"y\":315.0586387434555},{\"x\":202.33193717277487,\"y\":272.59790575916236}],\"attribute\":\"\",\"order\":1}]}}",
+ "url": "https://user-images.githubusercontent.com/15977946/152742728-99392ecf-8f5c-46cf-b5c4-fe7fb6b39976.jpg",
+ "fileName": "bing_dwen_dwen3.jpg"
+ },
+ {
+ "id": 4,
+ "result": "{\"width\":690,\"height\":690,\"valid\":true,\"rotate\":0,\"step_1\":{\"dataSourceStep\":0,\"toolName\":\"pointTool\",\"result\":[{\"x\":365.9528795811519,\"y\":464.5759162303665,\"attribute\":\"\",\"valid\":true,\"id\":\"IKprTuHS\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":1},{\"x\":470.71727748691103,\"y\":445.06806282722516,\"attribute\":\"\",\"valid\":true,\"id\":\"Z90CWkEI\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":2},{\"x\":410.74869109947645,\"y\":395.2146596858639,\"attribute\":\"\",\"valid\":true,\"id\":\"UWRstKZk\",\"sourceID\":\"\",\"textAttribute\":\"\",\"order\":3}]},\"step_2\":{\"dataSourceStep\":0,\"toolName\":\"polygonTool\",\"result\":[{\"id\":\"C30Pc9Ww\",\"sourceID\":\"\",\"valid\":true,\"textAttribute\":\"\",\"pointList\":[{\"x\":412.91623036649213,\"y\":325.85340314136124},{\"x\":468.5497382198953,\"y\":335.9685863874345},{\"x\":501.78534031413614,\"y\":369.2041884816754},{\"x\":514.0680628272252,\"y\":415.44502617801044},{\"x\":504.67539267015707,\"y\":472.5235602094241},{\"x\":484.44502617801044,\"y\":497.0890052356021},{\"x\":443.26178010471205,\"y\":512.9842931937172},{\"x\":389.7958115183246,\"y\":518.7643979057591},{\"x\":336.32984293193715,\"y\":504.31413612565444},{\"x\":302.3717277486911,\"y\":462.40837696335075},{\"x\":298.0366492146597,\"y\":416.89005235602093},{\"x\":318.26701570680626,\"y\":372.0942408376963},{\"x\":363.0628272251309,\"y\":341.0261780104712}],\"attribute\":\"\",\"order\":1}]}}",
+ "url": "https://user-images.githubusercontent.com/15977946/152742755-9dc75f89-4156-4103-9c6d-f35f1f409d11.jpg",
+ "fileName": "bing_dwen_dwen4.jpg"
+ }
+]
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/newyear/README.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/newyear/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c655c121e236146a00a378b5bf495dbf24e6888
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/newyear/README.md
@@ -0,0 +1,31 @@
+# New Year Hat and Firecracker Effects
+
+This demo provides New Year effects based on pose estimation results, such as adding a hat on the head and a firecracker in the hands.
+
+
+
+
+
+## Instruction
+
+### Get started
+
+Launch the demo from the mmpose root directory:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/newyear/new_year.py
+```
+
+### Hotkeys
+
+| Hotkey | Function |
+| -- | -- |
+| t | Toggle the hat effect on/off. |
+| f | Toggle the firecracker effect on/off. |
+| h | Show help information. |
+| m | Show the monitoring information. |
+| q | Exit. |
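+
+The hot-keys above correspond to the `enable_key` values of the effect nodes in `new_year.py`. A minimal excerpt from that config (buffer names as defined there):
+
+```python
+# 't' toggles the hat effect, 'f' toggles the firecracker effect
+dict(type='HatNode', name='Visualizer', enable_key='t',
+     frame_buffer='frame', output_buffer='vis_hat'),
+dict(type='FirecrackerNode', name='Visualizer', enable_key='f',
+     frame_buffer='vis_hat', output_buffer='vis_firecracker'),
+```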
+
+### Configuration
+
+See the [README](/tools/webcam/configs/examples/README.md#configuration) of the pose estimation demo for model configurations.
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/newyear/new_year.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/newyear/new_year.py
new file mode 100644
index 0000000000000000000000000000000000000000..3551184053312da288ccac95ae9f37e7f116dd1b
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/newyear/new_year.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ # Basic configurations of the runner
+ name='Pose Estimation',
+ camera_id=0,
+ camera_fps=20,
+ synchronous=False,
+ # Define nodes.
+ # The configuration of a node usually includes:
+ # 1. 'type': Node class name
+ # 2. 'name': Node name
+ # 3. I/O buffers (e.g. 'input_buffer', 'output_buffer'): specify the
+ # input and output buffer names. This may depend on the node class.
+ # 4. 'enable_key': assign a hot-key to toggle enable/disable this node.
+ # This may depend on the node class.
+ # 5. Other class-specific arguments
+ nodes=[
+ # 'DetectorNode':
+ # This node performs object detection from the frame image using an
+ # MMDetection model.
+ dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco.py',
+ model_checkpoint='https://download.openmmlab.com'
+ '/mmdetection/v2.0/ssd/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_'
+ 'scratch_600e_coco_20210629_110627-974d9307.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+ # 'TopDownPoseEstimatorNode':
+ # This node performs keypoint detection from the frame image using an
+        # MMPose top-down model. Detection results are needed.
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Human Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_mbv3_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://openmmlab-share.oss-cn-hangz'
+ 'hou.aliyuncs.com/mmpose/top_down/vipnas/vipnas_mbv3_co'
+ 'co_wholebody_256x192_dark-e2158108_20211205.pth',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='human_pose'),
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Animal Pose Estimator',
+ model_config='configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap'
+ '/animalpose/hrnet_w32_animalpose_256x256.py',
+ model_checkpoint='https://download.openmmlab.com/mmpose/animal/'
+ 'hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth',
+ cls_names=['cat', 'dog', 'horse', 'sheep', 'cow'],
+ input_buffer='human_pose',
+ output_buffer='animal_pose'),
+ # 'ModelResultBindingNode':
+ # This node binds the latest model inference result with the current
+ # frame. (This means the frame image and inference result may be
+ # asynchronous).
+ dict(
+ type='ModelResultBindingNode',
+ name='ResultBinder',
+ frame_buffer='_frame_', # `_frame_` is a runner-reserved buffer
+ result_buffer='animal_pose',
+ output_buffer='frame'),
+ # 'HatNode':
+        # This node draws the hat effect in the frame image.
+        # Pose results are needed.
+ dict(
+ type='HatNode',
+ name='Visualizer',
+ enable_key='t',
+ frame_buffer='frame',
+ output_buffer='vis_hat'),
+ # 'FirecrackerNode':
+        # This node draws the firecracker effect in the frame image.
+        # Pose results are needed.
+ dict(
+ type='FirecrackerNode',
+ name='Visualizer',
+ enable_key='f',
+ frame_buffer='vis_hat',
+ output_buffer='vis_firecracker'),
+ # 'NoticeBoardNode':
+        # This node shows a notice board with the given content, e.g. help
+ # information.
+ dict(
+ type='NoticeBoardNode',
+ name='Helper',
+ enable_key='h',
+ enable=True,
+ frame_buffer='vis_firecracker',
+ output_buffer='vis_notice',
+ content_lines=[
+ 'This is a demo for pose visualization and simple image '
+ 'effects. Have fun!', '', 'Hot-keys:', '"t": Hat effect',
+ '"f": Firecracker effect', '"h": Show help information',
+ '"m": Show diagnostic information', '"q": Exit'
+ ],
+ ),
+ # 'MonitorNode':
+        # This node shows diagnostic information in the frame image. It can
+ # be used for debugging or monitoring system resource status.
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ enable=False,
+ frame_buffer='vis_notice',
+ output_buffer='display'),
+ # 'RecorderNode':
+        # This node saves the output video into a file.
+ dict(
+ type='RecorderNode',
+ name='Recorder',
+ out_video_file='record.mp4',
+ frame_buffer='display',
+ output_buffer='_display_'
+ # `_display_` is a runner-reserved buffer
+ )
+ ])
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/supersaiyan/README.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/supersaiyan/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e9aef1bbaa7c62277a039cfad995a01e0491a10
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/supersaiyan/README.md
@@ -0,0 +1,96 @@
+# Super Saiyan Effects
+
+We can apply fun effects to videos using pose estimation results, like the Super Saiyan transformation.
+
+https://user-images.githubusercontent.com/11788150/150138076-2192079f-068a-4d43-bf27-2f1fd708cabc.mp4
+
+## Instruction
+
+### Get started
+
+Launch the demo from the mmpose root directory:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/supersaiyan/saiyan.py
+```
+
+### Hotkeys
+
+| Hotkey | Function |
+| -- | -- |
+| s | Toggle the Super Saiyan effect on/off. |
+| h | Show help information. |
+| m | Show the monitoring information. |
+| q | Exit. |
+
+Note that the demo will automatically save the output video into a file `record.mp4`.
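+
+The recording is done by a `RecorderNode` at the end of the node list; if your config does not include one, you can append it. A minimal sketch, following the RecorderNode used in other configs in this directory (the `frame_buffer` name must match the output buffer of the preceding node):
+
+```python
+dict(
+    type='RecorderNode',
+    name='Recorder',
+    out_video_file='record.mp4',  # path of the saved video
+    frame_buffer='display',       # frames produced by the previous node
+    output_buffer='_display_')    # `_display_` is a runner-reserved buffer
+```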
+
+### Configuration
+
+- **Choose a detection model**
+
+Users can choose detection models from the [MMDetection Model Zoo](https://mmdetection.readthedocs.io/en/v2.20.0/model_zoo.html). Just set the `model_config` and `model_checkpoint` in the detector node accordingly, and the model will be automatically downloaded and loaded.
+
+```python
+# 'DetectorNode':
+# This node performs object detection from the frame image using an
+# MMDetection model.
+dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py',
+ model_checkpoint='https://download.openmmlab.com/'
+ 'mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/'
+ 'mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392'
+ '__segm_mAP-0.354_20200505_003907-3e542a40.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+```
+
+- **Choose one or more pose models**
+
+In this demo we use two [top-down](https://github.com/open-mmlab/mmpose/tree/master/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap) pose estimation models for humans and animals respectively. Users can choose models from the [MMPose Model Zoo](https://mmpose.readthedocs.io/en/latest/modelzoo.html). To apply different pose models to different instance types, add multiple pose estimator nodes with `cls_names` set accordingly (see the sketch after the code block below).
+
+```python
+# 'TopDownPoseEstimatorNode':
+# This node performs keypoint detection from the frame image using an
+# MMPose top-down model. Detection results are needed.
+dict(
+ type='TopDownPoseEstimatorNode',
+ name='Human Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_mbv3_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://openmmlab-share.oss-cn-hangz'
+ 'hou.aliyuncs.com/mmpose/top_down/vipnas/vipnas_mbv3_co'
+ 'co_wholebody_256x192_dark-e2158108_20211205.pth',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='human_pose')
+```
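+
+For instance, an animal pose estimator can be chained after the human one by reading from its output buffer and restricting `cls_names` to animal classes. The sketch below reuses the animalpose model from other configs in this directory; the buffer names are illustrative and must match your own config:
+
+```python
+dict(
+    type='TopDownPoseEstimatorNode',
+    name='Animal Pose Estimator',
+    model_config='configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap'
+    '/animalpose/hrnet_w32_animalpose_256x256.py',
+    model_checkpoint='https://download.openmmlab.com/mmpose/animal/'
+    'hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth',
+    cls_names=['cat', 'dog', 'horse', 'sheep', 'cow'],
+    input_buffer='human_pose',   # output buffer of the human pose estimator
+    output_buffer='animal_pose')
+```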
+
+- **Run the demo without GPU**
+
+If your device does not have a GPU with CUDA support, the demo can run on CPU only by setting `device='cpu'` in all model nodes. For example:
+
+```python
+dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py',
+ model_checkpoint='https://download.openmmlab.com/'
+ 'mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/'
+ 'mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392'
+ '__segm_mAP-0.354_20200505_003907-3e542a40.pth',
+ device='cpu',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+```
+
+- **Debug webcam and display**
+
+You can launch the webcam runner with a debug config:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/examples/test_camera.py
+```
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/supersaiyan/saiyan.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/supersaiyan/saiyan.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a8e7bc82c7ca53fb6a0350ce8b0bd3e3ac6e737
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/supersaiyan/saiyan.py
@@ -0,0 +1,93 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ # Basic configurations of the runner
+ name='Super Saiyan Effects',
+ camera_id=0,
+ camera_fps=30,
+ synchronous=False,
+ # Define nodes.
+ # The configuration of a node usually includes:
+ # 1. 'type': Node class name
+ # 2. 'name': Node name
+ # 3. I/O buffers (e.g. 'input_buffer', 'output_buffer'): specify the
+ # input and output buffer names. This may depend on the node class.
+ # 4. 'enable_key': assign a hot-key to toggle enable/disable this node.
+ # This may depend on the node class.
+ # 5. Other class-specific arguments
+ nodes=[
+ # 'DetectorNode':
+ # This node performs object detection from the frame image using an
+ # MMDetection model.
+ dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py',
+ model_checkpoint='https://download.openmmlab.com/'
+ 'mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/'
+ 'mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392'
+ '__segm_mAP-0.354_20200505_003907-3e542a40.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+ # 'TopDownPoseEstimatorNode':
+ # This node performs keypoint detection from the frame image using an
+        # MMPose top-down model. Detection results are needed.
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Human Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_mbv3_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://openmmlab-share.oss-cn-hangz'
+ 'hou.aliyuncs.com/mmpose/top_down/vipnas/vipnas_mbv3_co'
+ 'co_wholebody_256x192_dark-e2158108_20211205.pth',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='human_pose'),
+ # 'ModelResultBindingNode':
+ # This node binds the latest model inference result with the current
+ # frame. (This means the frame image and inference result may be
+ # asynchronous).
+ dict(
+ type='ModelResultBindingNode',
+ name='ResultBinder',
+ frame_buffer='_frame_', # `_frame_` is a runner-reserved buffer
+ result_buffer='human_pose',
+ output_buffer='frame'),
+ # 'SaiyanNode':
+        # This node draws the Super Saiyan effect in the frame image.
+        # Pose results are needed.
+ dict(
+ type='SaiyanNode',
+ name='Visualizer',
+ enable_key='s',
+ cls_names=['person'],
+ enable=True,
+ frame_buffer='frame',
+ output_buffer='vis_saiyan'),
+ # 'NoticeBoardNode':
+        # This node shows a notice board with the given content, e.g. help
+ # information.
+ dict(
+ type='NoticeBoardNode',
+ name='Helper',
+ enable_key='h',
+ frame_buffer='vis_saiyan',
+ output_buffer='vis',
+ content_lines=[
+ 'This is a demo for super saiyan effects. Have fun!', '',
+ 'Hot-keys:', '"s": Saiyan effect',
+ '"h": Show help information',
+ '"m": Show diagnostic information', '"q": Exit'
+ ],
+ ),
+ # 'MonitorNode':
+        # This node shows diagnostic information in the frame image. It can
+ # be used for debugging or monitoring system resource status.
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ enable=False,
+ frame_buffer='vis',
+            output_buffer='_display_')  # `_display_` is a runner-reserved buffer
+ ])
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/valentinemagic/README.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/valentinemagic/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8063d2e18640a4312167ed1c022fce3cf613937e
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/valentinemagic/README.md
@@ -0,0 +1,35 @@
+# Valentine Magic
+
+Do you want to show your **love** to your beloved one, especially on Valentine's Day? Express it with your pose using MMPose right away and see the Valentine Magic!
+
+Try to pose a hand-heart gesture and see what happens!
+
+Prefer to blow a kiss? Here comes your flying heart~
+
+
+
+
+
+## Instruction
+
+### Get started
+
+Launch the demo from the mmpose root directory:
+
+```shell
+python tools/webcam/run_webcam.py --config tools/webcam/configs/valentinemagic/valentinemagic.py
+```
+
+### Hotkeys
+
+| Hotkey | Function |
+| -- | -- |
+| l | Toggle the Valentine Magic effect on/off. |
+| v | Toggle the pose visualization on/off. |
+| h | Show help information. |
+| m | Show diagnostic information. |
+| q | Exit. |
+
+### Configuration
+
+See the [README](/tools/webcam/configs/examples/README.md#configuration) of the pose estimation demo for model configurations.
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/valentinemagic/valentinemagic.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/valentinemagic/valentinemagic.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f921b07901805b490be264c28e12c7de3648f8b
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/configs/valentinemagic/valentinemagic.py
@@ -0,0 +1,118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+runner = dict(
+ # Basic configurations of the runner
+ name='Human Pose and Effects',
+ camera_id=0,
+ camera_fps=30,
+
+ # Define nodes.
+ #
+ # The configuration of a node usually includes:
+ # 1. 'type': Node class name
+ # 2. 'name': Node name
+ # 3. I/O buffers (e.g. 'input_buffer', 'output_buffer'): specify the
+ # input and output buffer names. This may depend on the node class.
+ # 4. 'enable_key': assign a hot-key to toggle enable/disable this node.
+ # This may depend on the node class.
+ # 5. Other class-specific arguments
+ nodes=[
+ # 'DetectorNode':
+ # This node performs object detection from the frame image using an
+ # MMDetection model.
+ dict(
+ type='DetectorNode',
+ name='Detector',
+ model_config='demo/mmdetection_cfg/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco.py',
+ model_checkpoint='https://download.openmmlab.com'
+ '/mmdetection/v2.0/ssd/'
+ 'ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_'
+ 'scratch_600e_coco_20210629_110627-974d9307.pth',
+ input_buffer='_input_', # `_input_` is a runner-reserved buffer
+ output_buffer='det_result'),
+ # 'TopDownPoseEstimatorNode':
+ # This node performs keypoint detection from the frame image using an
+        # MMPose top-down model. Detection results are needed.
+ dict(
+ type='TopDownPoseEstimatorNode',
+ name='Human Pose Estimator',
+ model_config='configs/wholebody/2d_kpt_sview_rgb_img/'
+ 'topdown_heatmap/coco-wholebody/'
+ 'vipnas_mbv3_coco_wholebody_256x192_dark.py',
+ model_checkpoint='https://download.openmmlab.com/mmpose/top_down/'
+ 'vipnas/vipnas_mbv3_coco_wholebody_256x192_dark'
+ '-e2158108_20211205.pth',
+ cls_names=['person'],
+ input_buffer='det_result',
+ output_buffer='pose_result'),
+ # 'ModelResultBindingNode':
+ # This node binds the latest model inference result with the current
+ # frame. (This means the frame image and inference result may be
+ # asynchronous).
+ dict(
+ type='ModelResultBindingNode',
+ name='ResultBinder',
+ frame_buffer='_frame_', # `_frame_` is a runner-reserved buffer
+ result_buffer='pose_result',
+ output_buffer='frame'),
+ # 'PoseVisualizerNode':
+        # This node draws the pose visualization result in the frame image.
+        # Pose results are needed.
+ dict(
+ type='PoseVisualizerNode',
+ name='Visualizer',
+ enable_key='v',
+ enable=False,
+ frame_buffer='frame',
+ output_buffer='vis'),
+ # 'ValentineMagicNode':
+        # This node draws hearts in the image.
+        # It can launch a dynamically expanding heart from between the
+        # hands when a person poses a "hand heart" gesture or blows a kiss.
+        # The effect is only triggered when there are two persons in the
+        # image. Pose results are needed.
+ dict(
+ type='ValentineMagicNode',
+ name='Visualizer',
+ enable_key='l',
+ frame_buffer='vis',
+ output_buffer='vis_heart',
+ ),
+ # 'NoticeBoardNode':
+        # This node shows a notice board with the given content, e.g. help
+ # information.
+ dict(
+ type='NoticeBoardNode',
+ name='Helper',
+ enable_key='h',
+ enable=False,
+ frame_buffer='vis_heart',
+ output_buffer='vis_notice',
+ content_lines=[
+ 'This is a demo for pose visualization and simple image '
+ 'effects. Have fun!', '', 'Hot-keys:',
+ '"h": Show help information', '"l": LoveHeart Effect',
+ '"v": PoseVisualizer', '"m": Show diagnostic information',
+ '"q": Exit'
+ ],
+ ),
+ # 'MonitorNode':
+        # This node shows diagnostic information in the frame image. It can
+ # be used for debugging or monitoring system resource status.
+ dict(
+ type='MonitorNode',
+ name='Monitor',
+ enable_key='m',
+ enable=False,
+ frame_buffer='vis_notice',
+            output_buffer='display'),
+ # 'RecorderNode':
+        # This node records the frames into a local file. It can save the
+        # visualization results. Comment out the following lines to turn it off.
+ dict(
+ type='RecorderNode',
+ name='Recorder',
+ out_video_file='record.mp4',
+ frame_buffer='display',
+ output_buffer='_display_')
+ ])
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/docs/example_cn.md b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/docs/example_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..69b9898c3237ab6c81b6af28dfcb50224ac424df
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/docs/example_cn.md
@@ -0,0 +1,171 @@
+# Development Example: Putting Sunglasses on a Cat
+
+## Design
+
+Before writing any code, let's think about how to implement this feature:
+
+- First, run object detection to find the cat in the image
+- Next, estimate the cat's keypoints, e.g. the positions of the left and right eyes
+- Finally, paste the sunglasses image at the right position. TA-DA!
+
+Following this idea, let's see how to implement it step by step.
+
+## Step 1: Start from an Existing Config
+
+The WebcamAPI already provides a number of Nodes implementing common functionality, together with corresponding example configs. Building on them reduces the development effort. For example, we can start from the pose estimation demo above, whose config is located at `tools/webcam/configs/example/pose_estimation.py`. To make it more intuitive, the functional nodes in this config are illustrated by the following flowchart:
+
+
+
+Each Data Buffer in the figure is a container that holds data. Users do not need to care about the internal details of a buffer; it can simply be understood as the name of a Node's input or output. These names can be chosen freely in the config, except for the following special ones:
+
+- _input_: holds video frames read by the runner, used for model inference
+- _frame_: holds video frames read by the runner, used for visualization
+- _display_: holds the result processed by all Nodes, used for on-screen display
+
+When a video frame is read in by the runner, it is put into both the _input_ and _frame_ buffers, then passes through the Nodes in the order defined by the node connections in the config, and finally reaches _display_, where it is read out by the runner and shown on screen. A minimal sketch of such a node chain is given at the end of this section.
+
+#### Get Advanced: About buffers
+
+- A buffer is essentially a bounded queue. The runner holds a BufferManager instance (see `mmpose/tools/webcam/webcam_apis/buffer.py`) that creates and manages all buffers. Each Node reads from or writes to its buffers according to the config.
+- When a buffer is full (i.e. it reaches its maximum length), writing to it is usually not blocked; instead, the oldest item in the buffer is pushed out.
+- Why are there two input buffers, _input_ and _frame_? Some Nodes are time-consuming (e.g. Nodes that run model inference, such as object detection and pose estimation). To keep the display smooth, we usually feed such expensive operations from _input_, while _frame_ is used to draw visualization results in real time. Since the nodes run asynchronously, this keeps the visualization real-time and smooth.
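+
+A minimal sketch of a config that chains nodes through buffers (model-related arguments such as `model_config` and `model_checkpoint` are omitted for brevity; apart from the reserved `_input_`, `_frame_` and `_display_`, the buffer names are arbitrary):
+
+```python
+runner = dict(
+    camera_id=0,
+    nodes=[
+        # inference nodes read frames from the reserved `_input_` buffer
+        dict(type='DetectorNode', name='Detector',
+             input_buffer='_input_', output_buffer='det_result'),
+        dict(type='TopDownPoseEstimatorNode', name='Pose Estimator',
+             input_buffer='det_result', output_buffer='pose_result'),
+        # bind the latest inference result with the current `_frame_`
+        dict(type='ModelResultBindingNode', name='ResultBinder',
+             frame_buffer='_frame_', result_buffer='pose_result',
+             output_buffer='frame'),
+        # drawing nodes consume `frame`; the last one writes to `_display_`
+        dict(type='PoseVisualizerNode', name='Visualizer',
+             frame_buffer='frame', output_buffer='_display_'),
+    ])
+```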
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/run_webcam.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/run_webcam.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce8d92e78e385d5bfaf2782cfc5b9d627531d20b
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/run_webcam.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from argparse import ArgumentParser
+
+from mmcv import Config, DictAction
+from webcam_apis import WebcamRunner
+
+
+def parse_args():
+    parser = ArgumentParser('Launch webcam runner')
+ parser.add_argument(
+ '--config',
+ type=str,
+ default='tools/webcam/configs/meow_dwen_dwen/meow_dwen_dwen.py')
+
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ default={},
+        help='override some settings in the used config; key-value pairs '
+        'in xxx=yyy format will be merged into the config file. For example, '
+ "'--cfg-options runner.camera_id=1 runner.synchronous=True'")
+
+ return parser.parse_args()
+
+
+def launch():
+ args = parse_args()
+ cfg = Config.fromfile(args.config)
+ cfg.merge_from_dict(args.cfg_options)
+
+ runner = WebcamRunner(**cfg.runner)
+ runner.run()
+
+
+if __name__ == '__main__':
+ launch()
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/__init__.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c8a2f5e0f6bf8d3c1b3d766dbe7a7d2c69cfaa4
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .webcam_runner import WebcamRunner
+
+__all__ = ['WebcamRunner']
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/__init__.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a882030b4a1b5aac87206e84fe69041bcd83035f
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import NODES
+from .faceswap_node import FaceSwapNode
+from .frame_effect_node import (BackgroundNode, BugEyeNode, MoustacheNode,
+ NoticeBoardNode, PoseVisualizerNode,
+ SaiyanNode, SunglassesNode)
+from .helper_node import ModelResultBindingNode, MonitorNode, RecorderNode
+from .mmdet_node import DetectorNode
+from .mmpose_node import TopDownPoseEstimatorNode
+from .valentinemagic_node import ValentineMagicNode
+from .xdwendwen_node import XDwenDwenNode
+
+__all__ = [
+ 'NODES', 'PoseVisualizerNode', 'DetectorNode', 'TopDownPoseEstimatorNode',
+ 'MonitorNode', 'BugEyeNode', 'SunglassesNode', 'ModelResultBindingNode',
+ 'NoticeBoardNode', 'RecorderNode', 'FaceSwapNode', 'MoustacheNode',
+ 'SaiyanNode', 'BackgroundNode', 'XDwenDwenNode', 'ValentineMagicNode'
+]
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/builder.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..44900b7efdc9822e693ce572cca16dafda388640
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/builder.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry
+
+NODES = Registry('node')
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/faceswap_node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/faceswap_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac44207fc363680aef49cfa1ea2b77707682484
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/faceswap_node.py
@@ -0,0 +1,254 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from enum import IntEnum
+from typing import List, Union
+
+import cv2
+import numpy as np
+
+from mmpose.datasets import DatasetInfo
+from .builder import NODES
+from .frame_drawing_node import FrameDrawingNode
+
+
+class Mode(IntEnum):
+    NONE = 0
+    SHUFFLE = 1
+ CLONE = 2
+
+
+@NODES.register_module()
+class FaceSwapNode(FrameDrawingNode):
+
+ def __init__(
+ self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ mode_key: Union[str, int],
+ ):
+ super().__init__(name, frame_buffer, output_buffer, enable=True)
+
+ self.mode_key = mode_key
+ self.mode_index = 0
+ self.register_event(
+ self.mode_key, is_keyboard=True, handler_func=self.switch_mode)
+ self.history = dict(mode=None)
+ self._mode = Mode.SHUFFLE
+
+ @property
+ def mode(self):
+ return self._mode
+
+ def switch_mode(self):
+ """Switch modes by updating mode index."""
+ self._mode = Mode((self._mode + 1) % len(Mode))
+
+ def draw(self, frame_msg):
+
+ if self.mode == Mode.NONE:
+ self.history = {'mode': Mode.NONE}
+ return frame_msg.get_image()
+
+ # Init history
+ if self.history['mode'] != self.mode:
+ self.history = {'mode': self.mode, 'target_map': {}}
+
+ # Merge pose results
+ pose_preds = self._merge_pose_results(frame_msg.get_pose_results())
+ num_target = len(pose_preds)
+
+ # Show mode
+ img = frame_msg.get_image()
+ canvas = img.copy()
+ if self.mode == Mode.SHUFFLE:
+ mode_txt = 'Shuffle'
+ else:
+ mode_txt = 'Clone'
+
+ cv2.putText(canvas, mode_txt, (10, 50), cv2.FONT_HERSHEY_DUPLEX, 0.8,
+ (255, 126, 0), 1)
+
+ # Skip if target number is less than 2
+ if num_target >= 2:
+ # Generate new mapping if target number changes
+ if num_target != len(self.history['target_map']):
+ if self.mode == Mode.SHUFFLE:
+ self.history['target_map'] = self._get_swap_map(num_target)
+ else:
+ self.history['target_map'] = np.repeat(
+ np.random.choice(num_target), num_target)
+
+            # Draw on canvas
+ for tar_idx, src_idx in enumerate(self.history['target_map']):
+ face_src = self._get_face_info(pose_preds[src_idx])
+ face_tar = self._get_face_info(pose_preds[tar_idx])
+ canvas = self._swap_face(img, canvas, face_src, face_tar)
+
+ return canvas
+
+ def _crop_face_by_contour(self, img, contour):
+ mask = np.zeros(img.shape[:2], dtype=np.uint8)
+ cv2.fillPoly(mask, [contour.astype(np.int32)], 1)
+ mask = cv2.dilate(
+ mask, kernel=np.ones((9, 9), dtype=np.uint8), anchor=(4, 0))
+ x1, y1, w, h = cv2.boundingRect(mask)
+ x2 = x1 + w
+ y2 = y1 + h
+ bbox = np.array([x1, y1, x2, y2], dtype=np.int64)
+ patch = img[y1:y2, x1:x2]
+ mask = mask[y1:y2, x1:x2]
+
+ return bbox, patch, mask
+
+ def _swap_face(self, img_src, img_tar, face_src, face_tar):
+
+ if face_src['dataset'] == face_tar['dataset']:
+ # Use full keypoints for face alignment
+ kpts_src = face_src['contour']
+ kpts_tar = face_tar['contour']
+ else:
+ # Use only common landmarks (eyes and nose) for face alignment if
+            # source and target have different dataset types
+ # (e.g. human vs animal)
+ kpts_src = face_src['landmarks']
+ kpts_tar = face_tar['landmarks']
+
+ # Get everything local
+ bbox_src, patch_src, mask_src = self._crop_face_by_contour(
+ img_src, face_src['contour'])
+
+ bbox_tar, _, mask_tar = self._crop_face_by_contour(
+ img_tar, face_tar['contour'])
+
+ kpts_src = kpts_src - bbox_src[:2]
+ kpts_tar = kpts_tar - bbox_tar[:2]
+
+ # Compute affine transformation matrix
+ trans_mat, _ = cv2.estimateAffine2D(
+ kpts_src.astype(np.float32), kpts_tar.astype(np.float32))
+ patch_warp = cv2.warpAffine(
+ patch_src,
+ trans_mat,
+ dsize=tuple(bbox_tar[2:] - bbox_tar[:2]),
+ borderValue=(0, 0, 0))
+ mask_warp = cv2.warpAffine(
+ mask_src,
+ trans_mat,
+ dsize=tuple(bbox_tar[2:] - bbox_tar[:2]),
+ borderValue=(0, 0, 0))
+
+ # Target mask
+ mask_tar = mask_tar & mask_warp
+ mask_tar_soft = cv2.GaussianBlur(mask_tar * 255, (3, 3), 3)
+
+ # Blending
+ center = tuple((0.5 * (bbox_tar[:2] + bbox_tar[2:])).astype(np.int64))
+ img_tar = cv2.seamlessClone(patch_warp, img_tar, mask_tar_soft, center,
+ cv2.NORMAL_CLONE)
+ return img_tar
+
+ @staticmethod
+ def _get_face_info(pose_pred):
+ keypoints = pose_pred['keypoints'][:, :2]
+ model_cfg = pose_pred['model_cfg']
+ dataset_info = DatasetInfo(model_cfg.data.test.dataset_info)
+
+ face_info = {
+ 'dataset': dataset_info.dataset_name,
+ 'landmarks': None, # For alignment
+ 'contour': None, # For mask generation
+ 'bbox': None # For image warping
+ }
+
+ # Fall back to hard coded keypoint id
+
+ if face_info['dataset'] == 'coco':
+ face_info['landmarks'] = np.stack([
+ keypoints[1], # left eye
+ keypoints[2], # right eye
+ keypoints[0], # nose
+ 0.5 * (keypoints[5] + keypoints[6]), # neck (shoulder center)
+ ])
+ elif face_info['dataset'] == 'coco_wholebody':
+ face_info['landmarks'] = np.stack([
+ keypoints[1], # left eye
+ keypoints[2], # right eye
+ keypoints[0], # nose
+ keypoints[32], # chin
+ ])
+ contour_ids = list(range(23, 40)) + list(range(40, 50))[::-1]
+ face_info['contour'] = keypoints[contour_ids]
+ elif face_info['dataset'] == 'ap10k':
+ face_info['landmarks'] = np.stack([
+ keypoints[0], # left eye
+ keypoints[1], # right eye
+ keypoints[2], # nose
+ keypoints[3], # neck
+ ])
+ elif face_info['dataset'] == 'animalpose':
+ face_info['landmarks'] = np.stack([
+ keypoints[0], # left eye
+ keypoints[1], # right eye
+ keypoints[4], # nose
+ keypoints[5], # throat
+ ])
+ elif face_info['dataset'] == 'wflw':
+ face_info['landmarks'] = np.stack([
+ keypoints[97], # left eye
+ keypoints[96], # right eye
+ keypoints[54], # nose
+                keypoints[16],  # chin
+ ])
+ contour_ids = list(range(33))[::-1] + list(range(33, 38)) + list(
+ range(42, 47))
+ face_info['contour'] = keypoints[contour_ids]
+ else:
+            raise ValueError('Cannot obtain face landmark information '
+                             f'from dataset: {face_info["dataset"]}')
+
+ # Face region
+ if face_info['contour'] is None:
+            # Manually defined contour of the face region
+ left_eye, right_eye, nose = face_info['landmarks'][:3]
+ eye_center = 0.5 * (left_eye + right_eye)
+ w_vec = right_eye - left_eye
+ eye_dist = np.linalg.norm(w_vec) + 1e-6
+ w_vec = w_vec / eye_dist
+ h_vec = np.array([w_vec[1], -w_vec[0]], dtype=w_vec.dtype)
+ w = max(0.5 * eye_dist, np.abs(np.dot(nose - eye_center, w_vec)))
+ h = np.abs(np.dot(nose - eye_center, h_vec))
+
+ left_top = eye_center + 1.5 * w * w_vec - 0.5 * h * h_vec
+ right_top = eye_center - 1.5 * w * w_vec - 0.5 * h * h_vec
+ left_bottom = eye_center + 1.5 * w * w_vec + 4 * h * h_vec
+ right_bottom = eye_center - 1.5 * w * w_vec + 4 * h * h_vec
+
+ face_info['contour'] = np.stack(
+ [left_top, right_top, right_bottom, left_bottom])
+
+ # Get tight bbox of face region
+ face_info['bbox'] = np.array([
+ face_info['contour'][:, 0].min(), face_info['contour'][:, 1].min(),
+ face_info['contour'][:, 0].max(), face_info['contour'][:, 1].max()
+ ]).astype(np.int64)
+
+ return face_info
+
+ @staticmethod
+ def _merge_pose_results(pose_results):
+ preds = []
+ if pose_results is not None:
+ for prefix, pose_result in enumerate(pose_results):
+ model_cfg = pose_result['model_cfg']
+ for idx, _pred in enumerate(pose_result['preds']):
+ pred = _pred.copy()
+ pred['id'] = f'{prefix}.{_pred.get("track_id", str(idx))}'
+ pred['model_cfg'] = model_cfg
+ preds.append(pred)
+ return preds
+
+ @staticmethod
+ def _get_swap_map(num_target):
+ ids = np.random.choice(num_target, num_target, replace=False)
+ target_map = ids[(ids + 1) % num_target]
+ return target_map
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/frame_drawing_node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/frame_drawing_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfc3511cadc2e8db0fb393ba1f821ee8091fcada
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/frame_drawing_node.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import abstractmethod
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from ..utils import FrameMessage, Message
+from .node import Node
+
+
+class FrameDrawingNode(Node):
+    """Base class for Nodes that draw on single frame images.
+
+ Args:
+ name (str, optional): The node name (also thread name).
+ frame_buffer (str): The name of the input buffer.
+ output_buffer (str | list): The name(s) of the output buffer(s).
+ enable_key (str | int, optional): Set a hot-key to toggle
+ enable/disable of the node. If an int value is given, it will be
+ treated as an ascii code of a key. Please note:
+            1. If enable_key is set, the bypass method needs to be
+            overridden to define the node behavior when disabled
+            2. Some hot-keys are reserved for particular uses. For example:
+            'q', 'Q' and 27 are used to quit
+ Default: None
+ enable (bool): Default enable/disable status. Default: True.
+ """
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True):
+
+ super().__init__(name=name, enable_key=enable_key)
+
+ # Register buffers
+ self.register_input_buffer(frame_buffer, 'frame', essential=True)
+ self.register_output_buffer(output_buffer)
+
+ self._enabled = enable
+
+ def process(self, input_msgs: Dict[str, Message]) -> Union[Message, None]:
+ frame_msg = input_msgs['frame']
+
+ img = self.draw(frame_msg)
+ frame_msg.set_image(img)
+
+ return frame_msg
+
+ def bypass(self, input_msgs: Dict[str, Message]) -> Union[Message, None]:
+ return input_msgs['frame']
+
+ @abstractmethod
+ def draw(self, frame_msg: FrameMessage) -> np.ndarray:
+ """Draw on the frame image with information from the single frame.
+
+ Args:
+            frame_msg (FrameMessage): The frame to get information from and
+ draw on.
+
+ Returns:
+ array: The output image
+ """
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/frame_effect_node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/frame_effect_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..c248c3820a944e6b5e7f0613794d6290fcda7bcc
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/frame_effect_node.py
@@ -0,0 +1,917 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple, Union
+
+import cv2
+import numpy as np
+from mmcv import color_val
+
+from mmpose.core import (apply_bugeye_effect, apply_sunglasses_effect,
+ imshow_bboxes, imshow_keypoints)
+from mmpose.datasets import DatasetInfo
+from ..utils import (FrameMessage, copy_and_paste, expand_and_clamp,
+ get_cached_file_path, get_eye_keypoint_ids,
+ get_face_keypoint_ids, get_wrist_keypoint_ids,
+ load_image_from_disk_or_url, screen_matting)
+from .builder import NODES
+from .frame_drawing_node import FrameDrawingNode
+
+try:
+ import psutil
+ psutil_proc = psutil.Process()
+except (ImportError, ModuleNotFoundError):
+ psutil_proc = None
+
+
+@NODES.register_module()
+class PoseVisualizerNode(FrameDrawingNode):
+ """Draw the bbox and keypoint detection results.
+
+ Args:
+ name (str, optional): The node name (also thread name).
+ frame_buffer (str): The name of the input buffer.
+ output_buffer (str|list): The name(s) of the output buffer(s).
+ enable_key (str|int, optional): Set a hot-key to toggle enable/disable
+ of the node. If an int value is given, it will be treated as an
+ ascii code of a key. Please note:
+            1. If enable_key is set, the bypass method needs to be
+            overridden to define the node behavior when disabled
+            2. Some hot-keys are reserved for particular uses. For example:
+            'q', 'Q' and 27 are used to quit
+ Default: None
+ enable (bool): Default enable/disable status. Default: True.
+ kpt_thr (float): The threshold of keypoint score. Default: 0.3.
+ radius (int): The radius of keypoint. Default: 4.
+ thickness (int): The thickness of skeleton. Default: 2.
+ bbox_color (str|tuple|dict): If a single color (a str like 'green' or
+            a tuple like (0, 255, 0)), it will be used to draw the bbox.
+ Optionally, a dict can be given as a map from class labels to
+ colors.
+ """
+
+ default_bbox_color = {
+ 'person': (148, 139, 255),
+ 'cat': (255, 255, 0),
+ 'dog': (255, 255, 0),
+ }
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True,
+ kpt_thr: float = 0.3,
+ radius: int = 4,
+ thickness: int = 2,
+ bbox_color: Optional[Union[str, Tuple, Dict]] = None):
+
+ super().__init__(name, frame_buffer, output_buffer, enable_key, enable)
+
+ self.kpt_thr = kpt_thr
+ self.radius = radius
+ self.thickness = thickness
+ if bbox_color is None:
+ self.bbox_color = self.default_bbox_color
+ elif isinstance(bbox_color, dict):
+ self.bbox_color = {k: color_val(v) for k, v in bbox_color.items()}
+ else:
+ self.bbox_color = color_val(bbox_color)
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+ pose_results = frame_msg.get_pose_results()
+
+ if not pose_results:
+ return canvas
+
+ for pose_result in frame_msg.get_pose_results():
+ model_cfg = pose_result['model_cfg']
+ dataset_info = DatasetInfo(model_cfg.dataset_info)
+
+ # Extract bboxes and poses
+ bbox_preds = []
+ bbox_labels = []
+ pose_preds = []
+ for pred in pose_result['preds']:
+ if 'bbox' in pred:
+ bbox_preds.append(pred['bbox'])
+ bbox_labels.append(pred.get('label', None))
+ pose_preds.append(pred['keypoints'])
+
+ # Get bbox colors
+ if isinstance(self.bbox_color, dict):
+ bbox_colors = [
+ self.bbox_color.get(label, (0, 255, 0))
+ for label in bbox_labels
+ ]
+ else:
+                # a single color is used for all bboxes
+                bbox_colors = self.bbox_color
+
+ # Draw bboxes
+ if bbox_preds:
+ bboxes = np.vstack(bbox_preds)
+
+ imshow_bboxes(
+ canvas,
+ bboxes,
+ labels=bbox_labels,
+ colors=bbox_colors,
+ text_color='white',
+ font_scale=0.5,
+ show=False)
+
+ # Draw poses
+ if pose_preds:
+ imshow_keypoints(
+ canvas,
+ pose_preds,
+ skeleton=dataset_info.skeleton,
+                    kpt_score_thr=self.kpt_thr,
+ pose_kpt_color=dataset_info.pose_kpt_color,
+ pose_link_color=dataset_info.pose_link_color,
+ radius=self.radius,
+ thickness=self.thickness)
+
+ return canvas
+
+
+@NODES.register_module()
+class SunglassesNode(FrameDrawingNode):
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True,
+ src_img_path: Optional[str] = None):
+
+ super().__init__(name, frame_buffer, output_buffer, enable_key, enable)
+
+ if src_img_path is None:
+            # The image is attributed to:
+ # https://www.vecteezy.com/free-vector/glass
+ # Glass Vectors by Vecteezy
+ src_img_path = 'demo/resources/sunglasses.jpg'
+ self.src_img = load_image_from_disk_or_url(src_img_path)
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+ pose_results = frame_msg.get_pose_results()
+ if not pose_results:
+ return canvas
+ for pose_result in pose_results:
+ model_cfg = pose_result['model_cfg']
+ preds = pose_result['preds']
+ left_eye_idx, right_eye_idx = get_eye_keypoint_ids(model_cfg)
+
+ canvas = apply_sunglasses_effect(canvas, preds, self.src_img,
+ left_eye_idx, right_eye_idx)
+ return canvas
+
+
+@NODES.register_module()
+class SpriteNode(FrameDrawingNode):
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True,
+ src_img_path: Optional[str] = None):
+
+ super().__init__(name, frame_buffer, output_buffer, enable_key, enable)
+
+ if src_img_path is None:
+ # Sprites of Touhou characters :)
+ # Come from https://www.deviantart.com/shadowbendy/art/Touhou-rpg-maker-vx-Sprite-1-812746920 # noqa: E501
+ src_img_path = (
+ 'https://user-images.githubusercontent.com/'
+ '26739999/151532276-33f968d9-917f-45e3-8a99-ebde60be83bb.png')
+ self.src_img = load_image_from_disk_or_url(
+ src_img_path, cv2.IMREAD_UNCHANGED)[:144, :108]
+ tmp = np.array(np.split(self.src_img, range(36, 144, 36), axis=0))
+ tmp = np.array(np.split(tmp, range(36, 108, 36), axis=2))
+ self.sprites = tmp
+ self.pos = None
+ self.anime_frame = 0
+
+ def apply_sprite_effect(self,
+ img,
+ pose_results,
+ left_hand_index,
+ right_hand_index,
+ kpt_thr=0.5):
+ """Apply sprite effect.
+
+ Args:
+ img (np.ndarray): Image data.
+ pose_results (list[dict]): The pose estimation results containing:
+ - "keypoints" ([K,3]): detection result in [x, y, score]
+ left_hand_index (int): Keypoint index of left hand
+ right_hand_index (int): Keypoint index of right hand
+ kpt_thr (float): The score threshold of required keypoints.
+ """
+
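+        # Illustrative shape of the expected input (values are placeholders;
+        # the real results come from the upstream pose estimator node):
+        #   pose_results = [{'keypoints': np.zeros((K, 3))}]  # [x, y, score]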
+ hm, wm = self.sprites.shape[2:4]
+        # initialize the sprite position at the image center
+ if self.pos is None:
+ self.pos = [img.shape[0] // 2, img.shape[1] // 2]
+
+ if len(pose_results) == 0:
+ return img
+
+ kpts = pose_results[0]['keypoints']
+
+ if kpts[left_hand_index, 2] < kpt_thr and kpts[right_hand_index,
+ 2] < kpt_thr:
+ aim = self.pos
+ else:
+ kpt_lhand = kpts[left_hand_index, :2][::-1]
+ kpt_rhand = kpts[right_hand_index, :2][::-1]
+
+ def distance(a, b):
+ return (a[0] - b[0])**2 + (a[1] - b[1])**2
+
+ # Go to the nearest hand
+ if distance(kpt_lhand, self.pos) < distance(kpt_rhand, self.pos):
+ aim = kpt_lhand
+ else:
+ aim = kpt_rhand
+
+ pos_thr = 15
+ if aim[0] < self.pos[0] - pos_thr:
+ # Go down
+ sprite = self.sprites[self.anime_frame][3]
+ self.pos[0] -= 1
+ elif aim[0] > self.pos[0] + pos_thr:
+ # Go up
+ sprite = self.sprites[self.anime_frame][0]
+ self.pos[0] += 1
+ elif aim[1] < self.pos[1] - pos_thr:
+ # Go right
+ sprite = self.sprites[self.anime_frame][1]
+ self.pos[1] -= 1
+ elif aim[1] > self.pos[1] + pos_thr:
+ # Go left
+ sprite = self.sprites[self.anime_frame][2]
+ self.pos[1] += 1
+ else:
+ # Stay
+ self.anime_frame = 0
+ sprite = self.sprites[self.anime_frame][0]
+
+ if self.anime_frame < 2:
+ self.anime_frame += 1
+ else:
+ self.anime_frame = 0
+
+ x = self.pos[0] - hm // 2
+ y = self.pos[1] - wm // 2
+ x = max(0, min(x, img.shape[0] - hm))
+        y = max(0, min(y, img.shape[1] - wm))
+
+ # Overlay image with transparent
+ img[x:x + hm, y:y +
+ wm] = (img[x:x + hm, y:y + wm] * (1 - sprite[:, :, 3:] / 255) +
+ sprite[:, :, :3] * (sprite[:, :, 3:] / 255)).astype('uint8')
+
+ return img
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+ pose_results = frame_msg.get_pose_results()
+ if not pose_results:
+ return canvas
+ for pose_result in pose_results:
+ model_cfg = pose_result['model_cfg']
+ preds = pose_result['preds']
+ # left_hand_idx, right_hand_idx = get_wrist_keypoint_ids(model_cfg) # noqa: E501
+ left_hand_idx, right_hand_idx = get_eye_keypoint_ids(model_cfg)
+
+ canvas = self.apply_sprite_effect(canvas, preds, left_hand_idx,
+ right_hand_idx)
+ return canvas
+
+
+@NODES.register_module()
+class BackgroundNode(FrameDrawingNode):
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True,
+ src_img_path: Optional[str] = None,
+ cls_ids: Optional[List] = None,
+ cls_names: Optional[List] = None):
+
+ super().__init__(name, frame_buffer, output_buffer, enable_key, enable)
+
+ self.cls_ids = cls_ids
+ self.cls_names = cls_names
+
+ if src_img_path is None:
+ src_img_path = 'https://user-images.githubusercontent.com/'\
+ '11788150/149731957-abd5c908-9c7f-45b2-b7bf-'\
+ '821ab30c6a3e.jpg'
+ self.src_img = load_image_from_disk_or_url(src_img_path)
+
+ def apply_background_effect(self,
+ img,
+ det_results,
+ background_img,
+ effect_region=(0.2, 0.2, 0.8, 0.8)):
+ """Change background.
+
+ Args:
+ img (np.ndarray): Image data.
+ det_results (list[dict]): The detection results containing:
+
+ - "cls_id" (int): Class index.
+ - "label" (str): Class label (e.g. 'person').
+ - "bbox" (ndarray:(5, )): bounding box result
+ [x, y, w, h, score].
+ - "mask" (ndarray:(w, h)): instance segmentation result.
+ background_img (np.ndarray): Background image.
+ effect_region (tuple(4, )): The region to apply mask,
+ the coordinates are normalized (x1, y1, x2, y2).
+ """
+ if len(det_results) > 0:
+ # Choose the one with the highest score.
+ det_result = det_results[0]
+ bbox = det_result['bbox']
+ mask = det_result['mask'].astype(np.uint8)
+ img = copy_and_paste(img, background_img, mask, bbox,
+ effect_region)
+ return img
+ else:
+ return background_img
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+ if canvas.shape != self.src_img.shape:
+            # cv2.resize expects dsize as (width, height)
+            self.src_img = cv2.resize(self.src_img,
+                                      (canvas.shape[1], canvas.shape[0]))
+ det_results = frame_msg.get_detection_results()
+ if not det_results:
+ return canvas
+
+ full_preds = []
+ for det_result in det_results:
+ preds = det_result['preds']
+ if self.cls_ids:
+ # Filter results by class ID
+ filtered_preds = [
+ p for p in preds if p['cls_id'] in self.cls_ids
+ ]
+ elif self.cls_names:
+ # Filter results by class name
+ filtered_preds = [
+ p for p in preds if p['label'] in self.cls_names
+ ]
+ else:
+ filtered_preds = preds
+ full_preds.extend(filtered_preds)
+
+ canvas = self.apply_background_effect(canvas, full_preds, self.src_img)
+
+ return canvas
+
+
+@NODES.register_module()
+class SaiyanNode(FrameDrawingNode):
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True,
+ hair_img_path: Optional[str] = None,
+ light_video_path: Optional[str] = None,
+ cls_ids: Optional[List] = None,
+ cls_names: Optional[List] = None):
+
+ super().__init__(name, frame_buffer, output_buffer, enable_key, enable)
+
+ self.cls_ids = cls_ids
+ self.cls_names = cls_names
+
+ if hair_img_path is None:
+ hair_img_path = 'https://user-images.githubusercontent.com/'\
+ '11788150/149732117-fcd2d804-dc2c-426c-bee7-'\
+ '94be6146e05c.png'
+ self.hair_img = load_image_from_disk_or_url(hair_img_path)
+
+ if light_video_path is None:
+ light_video_path = get_cached_file_path(
+ 'https://'
+ 'user-images.githubusercontent.com/11788150/149732080'
+ '-ea6cfeda-0dc5-4bbb-892a-3831e5580520.mp4')
+ self.light_video_path = light_video_path
+ self.light_video = cv2.VideoCapture(self.light_video_path)
+
+ def apply_saiyan_effect(self,
+ img,
+ pose_results,
+ saiyan_img,
+ light_frame,
+ face_indices,
+ bbox_thr=0.3,
+ kpt_thr=0.5):
+ """Apply saiyan hair effect.
+
+ Args:
+ img (np.ndarray): Image data.
+ pose_results (list[dict]): The pose estimation results containing:
+ - "keypoints" ([K,3]): keypoint detection result
+ in [x, y, score]
+ saiyan_img (np.ndarray): Saiyan image with transparent background.
+ light_frame (np.ndarray): Light image with green screen.
+            face_indices (list[int]): Keypoint indices of the face
+            bbox_thr (float): The score threshold of required bboxes.
+            kpt_thr (float): The score threshold of required keypoints.
+ """
+ img = img.copy()
+ im_shape = img.shape
+ # Apply lightning effects.
+ light_mask = screen_matting(light_frame, color='green')
+
+ # anchor points in the mask
+ pts_src = np.array(
+ [
+ [84, 398], # face kpt 0
+ [331, 393], # face kpt 16
+ [84, 145],
+ [331, 140]
+ ],
+ dtype=np.float32)
+
+ for pose in pose_results:
+ bbox = pose['bbox']
+
+ if bbox[-1] < bbox_thr:
+ continue
+
+ mask_inst = pose['mask']
+ # cache
+ fg = img[np.where(mask_inst)]
+
+ bbox = expand_and_clamp(bbox[:4], im_shape, s=3.0)
+ # Apply light effects between fg and bg
+ img = copy_and_paste(
+ light_frame,
+ img,
+ light_mask,
+ effect_region=(bbox[0] / im_shape[1], bbox[1] / im_shape[0],
+ bbox[2] / im_shape[1], bbox[3] / im_shape[0]))
+ # pop
+ img[np.where(mask_inst)] = fg
+
+ # Apply Saiyan hair effects
+ kpts = pose['keypoints']
+ if kpts[face_indices[0], 2] < kpt_thr or kpts[face_indices[16],
+ 2] < kpt_thr:
+ continue
+
+ kpt_0 = kpts[face_indices[0], :2]
+ kpt_16 = kpts[face_indices[16], :2]
+ # orthogonal vector
+ vo = (kpt_0 - kpt_16)[::-1] * [-1, 1]
+
+ # anchor points in the image by eye positions
+ pts_tar = np.vstack([kpt_0, kpt_16, kpt_0 + vo, kpt_16 + vo])
+
+ h_mat, _ = cv2.findHomography(pts_src, pts_tar)
+ patch = cv2.warpPerspective(
+ saiyan_img,
+ h_mat,
+ dsize=(img.shape[1], img.shape[0]),
+ borderValue=(0, 0, 0))
+ mask_patch = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
+ mask_patch = (mask_patch > 1).astype(np.uint8)
+ img = cv2.copyTo(patch, mask_patch, img)
+
+ return img
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+
+ det_results = frame_msg.get_detection_results()
+ if not det_results:
+ return canvas
+
+ pose_results = frame_msg.get_pose_results()
+ if not pose_results:
+ return canvas
+
+ for pose_result in pose_results:
+ model_cfg = pose_result['model_cfg']
+ preds = pose_result['preds']
+ face_indices = get_face_keypoint_ids(model_cfg)
+
+ ret, frame = self.light_video.read()
+ if not ret:
+ self.light_video = cv2.VideoCapture(self.light_video_path)
+ ret, frame = self.light_video.read()
+
+ canvas = self.apply_saiyan_effect(canvas, preds, self.hair_img,
+ frame, face_indices)
+
+ return canvas
+
+
+@NODES.register_module()
+class MoustacheNode(FrameDrawingNode):
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True,
+ src_img_path: Optional[str] = None):
+
+ super().__init__(name, frame_buffer, output_buffer, enable_key, enable)
+
+ if src_img_path is None:
+ src_img_path = 'https://user-images.githubusercontent.com/'\
+ '11788150/149732141-3afbab55-252a-428c-b6d8'\
+ '-0e352f432651.jpeg'
+ self.src_img = load_image_from_disk_or_url(src_img_path)
+
+ def apply_moustache_effect(self,
+ img,
+ pose_results,
+ moustache_img,
+ face_indices,
+ kpt_thr=0.5):
+ """Apply moustache effect.
+
+ Args:
+ img (np.ndarray): Image data.
+ pose_results (list[dict]): The pose estimation results containing:
+ - "keypoints" ([K,3]): keypoint detection result
+ in [x, y, score]
+ moustache_img (np.ndarray): Moustache image with white background.
+            face_indices (list[int]): Keypoint indices of the face
+ kpt_thr (float): The score threshold of required keypoints.
+ """
+
+ hm, wm = moustache_img.shape[:2]
+ # anchor points in the moustache mask
+ pts_src = np.array([[1164, 741], [1729, 741], [1164, 1244],
+ [1729, 1244]],
+ dtype=np.float32)
+
+ for pose in pose_results:
+ kpts = pose['keypoints']
+ if kpts[face_indices[32], 2] < kpt_thr \
+ or kpts[face_indices[34], 2] < kpt_thr \
+ or kpts[face_indices[61], 2] < kpt_thr \
+ or kpts[face_indices[63], 2] < kpt_thr:
+ continue
+
+ kpt_32 = kpts[face_indices[32], :2]
+ kpt_34 = kpts[face_indices[34], :2]
+ kpt_61 = kpts[face_indices[61], :2]
+ kpt_63 = kpts[face_indices[63], :2]
+ # anchor points in the image by eye positions
+ pts_tar = np.vstack([kpt_32, kpt_34, kpt_61, kpt_63])
+
+ h_mat, _ = cv2.findHomography(pts_src, pts_tar)
+ patch = cv2.warpPerspective(
+ moustache_img,
+ h_mat,
+ dsize=(img.shape[1], img.shape[0]),
+ borderValue=(255, 255, 255))
+ # mask the white background area in the patch with a threshold 200
+ mask = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
+ mask = (mask < 200).astype(np.uint8)
+ img = cv2.copyTo(patch, mask, img)
+
+ return img
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+ pose_results = frame_msg.get_pose_results()
+ if not pose_results:
+ return canvas
+ for pose_result in pose_results:
+ model_cfg = pose_result['model_cfg']
+ preds = pose_result['preds']
+ face_indices = get_face_keypoint_ids(model_cfg)
+ canvas = self.apply_moustache_effect(canvas, preds, self.src_img,
+ face_indices)
+ return canvas
+
+
+@NODES.register_module()
+class BugEyeNode(FrameDrawingNode):
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+ pose_results = frame_msg.get_pose_results()
+ if not pose_results:
+ return canvas
+ for pose_result in pose_results:
+ model_cfg = pose_result['model_cfg']
+ preds = pose_result['preds']
+ left_eye_idx, right_eye_idx = get_eye_keypoint_ids(model_cfg)
+
+ canvas = apply_bugeye_effect(canvas, preds, left_eye_idx,
+ right_eye_idx)
+ return canvas
+
+
+@NODES.register_module()
+class NoticeBoardNode(FrameDrawingNode):
+
+ default_content_lines = ['This is a notice board!']
+
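+    # An illustrative config entry for this node (buffer names below are
+    # placeholders; the real wiring depends on the webcam runner config):
+    #   dict(type='NoticeBoardNode', name='notice',
+    #        frame_buffer='vis', output_buffer='display',
+    #        content_lines=['Press Q to quit'])
+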
+ def __init__(
+ self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True,
+ content_lines: Optional[List[str]] = None,
+ x_offset: int = 20,
+ y_offset: int = 20,
+ y_delta: int = 15,
+ text_color: Union[str, Tuple[int, int, int]] = 'black',
+ background_color: Union[str, Tuple[int, int, int]] = (255, 183, 0),
+ text_scale: float = 0.4,
+ ):
+ super().__init__(name, frame_buffer, output_buffer, enable_key, enable)
+
+ self.x_offset = x_offset
+ self.y_offset = y_offset
+ self.y_delta = y_delta
+ self.text_color = color_val(text_color)
+ self.background_color = color_val(background_color)
+ self.text_scale = text_scale
+
+ if content_lines:
+ self.content_lines = content_lines
+ else:
+ self.content_lines = self.default_content_lines
+
+ def draw(self, frame_msg: FrameMessage) -> np.ndarray:
+ img = frame_msg.get_image()
+ canvas = np.full(img.shape, self.background_color, dtype=img.dtype)
+
+ x = self.x_offset
+ y = self.y_offset
+
+ max_len = max([len(line) for line in self.content_lines])
+
+ def _put_line(line=''):
+ nonlocal y
+ cv2.putText(canvas, line, (x, y), cv2.FONT_HERSHEY_DUPLEX,
+ self.text_scale, self.text_color, 1)
+ y += self.y_delta
+
+ for line in self.content_lines:
+ _put_line(line)
+
+ x1 = max(0, self.x_offset)
+ x2 = min(img.shape[1], int(x + max_len * self.text_scale * 20))
+ y1 = max(0, self.y_offset - self.y_delta)
+ y2 = min(img.shape[0], y)
+
+ src1 = canvas[y1:y2, x1:x2]
+ src2 = img[y1:y2, x1:x2]
+ img[y1:y2, x1:x2] = cv2.addWeighted(src1, 0.5, src2, 0.5, 0)
+
+ return img
+
+
+@NODES.register_module()
+class HatNode(FrameDrawingNode):
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ src_img_path: Optional[str] = None):
+
+ super().__init__(name, frame_buffer, output_buffer, enable_key)
+
+ if src_img_path is None:
+            # The image is attributed to:
+ # http://616pic.com/sucai/1m9i70p52.html
+ src_img_path = 'https://user-images.githubusercontent.' \
+ 'com/28900607/149766271-2f591c19-9b67-4' \
+ 'd92-8f94-c272396ca141.png'
+ self.src_img = load_image_from_disk_or_url(src_img_path,
+ cv2.IMREAD_UNCHANGED)
+
+ @staticmethod
+ def apply_hat_effect(img,
+ pose_results,
+ hat_img,
+ left_eye_index,
+ right_eye_index,
+ kpt_thr=0.5):
+ """Apply hat effect.
+ Args:
+ img (np.ndarray): Image data.
+ pose_results (list[dict]): The pose estimation results containing:
+ - "keypoints" ([K,3]): keypoint detection result in
+ [x, y, score]
+            hat_img (np.ndarray): Hat image with an alpha channel.
+ left_eye_index (int): Keypoint index of left eye
+ right_eye_index (int): Keypoint index of right eye
+ kpt_thr (float): The score threshold of required keypoints.
+ """
+        img = img.copy()
+        hm, wm = hat_img.shape[:2]
+        # anchor points in the hat image
+ a = 0.3
+ b = 0.7
+ pts_src = np.array([[a * wm, a * hm], [a * wm, b * hm],
+ [b * wm, a * hm], [b * wm, b * hm]],
+ dtype=np.float32)
+
+ for pose in pose_results:
+ kpts = pose['keypoints']
+
+ if kpts[left_eye_index, 2] < kpt_thr or \
+ kpts[right_eye_index, 2] < kpt_thr:
+ continue
+
+ kpt_leye = kpts[left_eye_index, :2]
+ kpt_reye = kpts[right_eye_index, :2]
+ # orthogonal vector to the left-to-right eyes
+ vo = 0.5 * (kpt_reye - kpt_leye)[::-1] * [-1, 1]
+ veye = 0.5 * (kpt_reye - kpt_leye)
+
+ # anchor points in the image by eye positions
+ pts_tar = np.vstack([
+ kpt_reye + 1 * veye + 5 * vo, kpt_reye + 1 * veye + 1 * vo,
+ kpt_leye - 1 * veye + 5 * vo, kpt_leye - 1 * veye + 1 * vo
+ ])
+
+ h_mat, _ = cv2.findHomography(pts_src, pts_tar)
+ patch = cv2.warpPerspective(
+ hat_img,
+ h_mat,
+ dsize=(img.shape[1], img.shape[0]),
+ borderValue=(255, 255, 255))
+            # build the mask from the alpha channel and drop near-black pixels
+ mask = (patch[:, :, -1] > 128)
+ patch = patch[:, :, :-1]
+ mask = mask * (cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY) > 30)
+ mask = mask.astype(np.uint8)
+
+ img = cv2.copyTo(patch, mask, img)
+ return img
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+ pose_results = frame_msg.get_pose_results()
+ if not pose_results:
+ return canvas
+ for pose_result in pose_results:
+ model_cfg = pose_result['model_cfg']
+ preds = pose_result['preds']
+ left_eye_idx, right_eye_idx = get_eye_keypoint_ids(model_cfg)
+
+ canvas = self.apply_hat_effect(canvas, preds, self.src_img,
+ left_eye_idx, right_eye_idx)
+ return canvas
+
+
+@NODES.register_module()
+class FirecrackerNode(FrameDrawingNode):
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ src_img_path: Optional[str] = None):
+
+ super().__init__(name, frame_buffer, output_buffer, enable_key)
+
+        if src_img_path is None:
+            src_img_path = 'https://user-images.githubusercontent' \
+                           '.com/28900607/149766281-6376055c-ed8b' \
+                           '-472b-991f-60e6ae6ee1da.gif'
+        self.src_img_path = src_img_path
+        src_img = cv2.VideoCapture(self.src_img_path)
+
+ self.frame_list = []
+ ret, frame = src_img.read()
+ while frame is not None:
+ self.frame_list.append(frame)
+ ret, frame = src_img.read()
+ self.num_frames = len(self.frame_list)
+ self.frame_idx = 0
+ self.frame_period = 4 # each frame in gif lasts for 4 frames in video
+
+ @staticmethod
+ def apply_firecracker_effect(img,
+ pose_results,
+ firecracker_img,
+ left_wrist_idx,
+ right_wrist_idx,
+ kpt_thr=0.5):
+ """Apply firecracker effect.
+ Args:
+ img (np.ndarray): Image data.
+ pose_results (list[dict]): The pose estimation results containing:
+ - "keypoints" ([K,3]): keypoint detection result in
+ [x, y, score]
+ firecracker_img (np.ndarray): Firecracker image with white
+ background.
+ left_wrist_idx (int): Keypoint index of left wrist
+ right_wrist_idx (int): Keypoint index of right wrist
+ kpt_thr (float): The score threshold of required keypoints.
+ """
+
+ hm, wm = firecracker_img.shape[:2]
+ # anchor points in the firecracker mask
+ pts_src = np.array([[0. * wm, 0. * hm], [0. * wm, 1. * hm],
+ [1. * wm, 0. * hm], [1. * wm, 1. * hm]],
+ dtype=np.float32)
+
+ h, w = img.shape[:2]
+ h_tar = h / 3
+ w_tar = h_tar / hm * wm
+
+ for pose in pose_results:
+ kpts = pose['keypoints']
+
+ if kpts[left_wrist_idx, 2] > kpt_thr:
+ kpt_lwrist = kpts[left_wrist_idx, :2]
+                # anchor points in the image by the left wrist position
+ pts_tar = np.vstack([
+ kpt_lwrist - [w_tar / 2, 0],
+ kpt_lwrist - [w_tar / 2, -h_tar],
+ kpt_lwrist + [w_tar / 2, 0],
+ kpt_lwrist + [w_tar / 2, h_tar]
+ ])
+
+ h_mat, _ = cv2.findHomography(pts_src, pts_tar)
+ patch = cv2.warpPerspective(
+ firecracker_img,
+ h_mat,
+ dsize=(img.shape[1], img.shape[0]),
+ borderValue=(255, 255, 255))
+                # mask the white background area in the patch with
+                # a threshold of 240
+ mask = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
+ mask = (mask < 240).astype(np.uint8)
+ img = cv2.copyTo(patch, mask, img)
+
+ if kpts[right_wrist_idx, 2] > kpt_thr:
+ kpt_rwrist = kpts[right_wrist_idx, :2]
+
+                # anchor points in the image by the right wrist position
+ pts_tar = np.vstack([
+ kpt_rwrist - [w_tar / 2, 0],
+ kpt_rwrist - [w_tar / 2, -h_tar],
+ kpt_rwrist + [w_tar / 2, 0],
+ kpt_rwrist + [w_tar / 2, h_tar]
+ ])
+
+ h_mat, _ = cv2.findHomography(pts_src, pts_tar)
+ patch = cv2.warpPerspective(
+ firecracker_img,
+ h_mat,
+ dsize=(img.shape[1], img.shape[0]),
+ borderValue=(255, 255, 255))
+                # mask the white background area in the patch with
+                # a threshold of 240
+ mask = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
+ mask = (mask < 240).astype(np.uint8)
+ img = cv2.copyTo(patch, mask, img)
+
+ return img
+
+ def draw(self, frame_msg):
+ canvas = frame_msg.get_image()
+ pose_results = frame_msg.get_pose_results()
+ if not pose_results:
+ return canvas
+
+ frame = self.frame_list[self.frame_idx // self.frame_period]
+ for pose_result in pose_results:
+ model_cfg = pose_result['model_cfg']
+ preds = pose_result['preds']
+ left_wrist_idx, right_wrist_idx = get_wrist_keypoint_ids(model_cfg)
+
+ canvas = self.apply_firecracker_effect(canvas, preds, frame,
+ left_wrist_idx,
+ right_wrist_idx)
+ self.frame_idx = (self.frame_idx + 1) % (
+ self.num_frames * self.frame_period)
+
+ return canvas
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/helper_node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/helper_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..349c4f423456781a092d83fc6382d7f9f3376fd8
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/helper_node.py
@@ -0,0 +1,296 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import time
+from queue import Full, Queue
+from threading import Thread
+from typing import List, Optional, Union
+
+import cv2
+import numpy as np
+from mmcv import color_val
+
+from mmpose.utils.timer import RunningAverage
+from .builder import NODES
+from .node import Node
+
+try:
+ import psutil
+ psutil_proc = psutil.Process()
+except (ImportError, ModuleNotFoundError):
+ psutil_proc = None
+
+
+@NODES.register_module()
+class ModelResultBindingNode(Node):
+
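+    # This node pairs each frame with the latest model result. An
+    # illustrative (not prescriptive) wiring in a webcam runner config:
+    #   dict(type='ModelResultBindingNode', name='binder',
+    #        frame_buffer='frame', result_buffer='pose_result',
+    #        output_buffer='vis')
+    # The exact buffer names depend on the surrounding pipeline config.
+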
+ def __init__(self, name: str, frame_buffer: str, result_buffer: str,
+ output_buffer: Union[str, List[str]]):
+ super().__init__(name=name, enable=True)
+ self.synchronous = None
+
+ # Cache the latest model result
+ self.last_result_msg = None
+ self.last_output_msg = None
+
+ # Inference speed analysis
+ self.frame_fps = RunningAverage(window=10)
+ self.frame_lag = RunningAverage(window=10)
+ self.result_fps = RunningAverage(window=10)
+ self.result_lag = RunningAverage(window=10)
+
+ # Register buffers
+ # Note that essential buffers will be set in set_runner() because
+ # it depends on the runner.synchronous attribute.
+ self.register_input_buffer(result_buffer, 'result', essential=False)
+ self.register_input_buffer(frame_buffer, 'frame', essential=False)
+ self.register_output_buffer(output_buffer)
+
+ def set_runner(self, runner):
+ super().set_runner(runner)
+
+ # Set synchronous according to the runner
+ if runner.synchronous:
+ self.synchronous = True
+ essential_input = 'result'
+ else:
+ self.synchronous = False
+ essential_input = 'frame'
+
+ # Set essential input buffer according to the synchronous setting
+ for buffer_info in self._input_buffers:
+ if buffer_info.input_name == essential_input:
+ buffer_info.essential = True
+
+ def process(self, input_msgs):
+ result_msg = input_msgs['result']
+
+ # Update last result
+ if result_msg is not None:
+ # Update result FPS
+ if self.last_result_msg is not None:
+ self.result_fps.update(
+ 1.0 /
+ (result_msg.timestamp - self.last_result_msg.timestamp))
+ # Update inference latency
+ self.result_lag.update(time.time() - result_msg.timestamp)
+ # Update last inference result
+ self.last_result_msg = result_msg
+
+ if not self.synchronous:
+ # Asynchronous mode: Bind the latest result with the current frame.
+ frame_msg = input_msgs['frame']
+
+ self.frame_lag.update(time.time() - frame_msg.timestamp)
+
+ # Bind result to frame
+ if self.last_result_msg is not None:
+ frame_msg.set_full_results(
+ self.last_result_msg.get_full_results())
+ frame_msg.merge_route_info(
+ self.last_result_msg.get_route_info())
+
+ output_msg = frame_msg
+
+ else:
+ # Synchronous mode: Directly output the frame that the model result
+ # was obtained from.
+ self.frame_lag.update(time.time() - result_msg.timestamp)
+ output_msg = result_msg
+
+ # Update frame fps and lag
+ if self.last_output_msg is not None:
+ self.frame_lag.update(time.time() - output_msg.timestamp)
+ self.frame_fps.update(
+ 1.0 / (output_msg.timestamp - self.last_output_msg.timestamp))
+ self.last_output_msg = output_msg
+
+ return output_msg
+
+ def _get_node_info(self):
+ info = super()._get_node_info()
+ info['result_fps'] = self.result_fps.average()
+ info['result_lag (ms)'] = self.result_lag.average() * 1000
+ info['frame_fps'] = self.frame_fps.average()
+ info['frame_lag (ms)'] = self.frame_lag.average() * 1000
+ return info
+
+
+@NODES.register_module()
+class MonitorNode(Node):
+
+ _default_ignore_items = ['timestamp']
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = False,
+ x_offset=20,
+ y_offset=20,
+ y_delta=15,
+ text_color='black',
+ background_color=(255, 183, 0),
+ text_scale=0.4,
+ ignore_items: Optional[List[str]] = None):
+ super().__init__(name=name, enable_key=enable_key, enable=enable)
+
+ self.x_offset = x_offset
+ self.y_offset = y_offset
+ self.y_delta = y_delta
+ self.text_color = color_val(text_color)
+ self.background_color = color_val(background_color)
+ self.text_scale = text_scale
+ if ignore_items is None:
+ self.ignore_items = self._default_ignore_items
+ else:
+ self.ignore_items = ignore_items
+
+ self.register_input_buffer(frame_buffer, 'frame', essential=True)
+ self.register_output_buffer(output_buffer)
+
+ def process(self, input_msgs):
+ frame_msg = input_msgs['frame']
+
+ frame_msg.update_route_info(
+ node_name='System Info',
+ node_type='dummy',
+ info=self._get_system_info())
+
+ img = frame_msg.get_image()
+ route_info = frame_msg.get_route_info()
+ img = self._show_route_info(img, route_info)
+
+ frame_msg.set_image(img)
+ return frame_msg
+
+ def _get_system_info(self):
+ sys_info = {}
+ if psutil_proc is not None:
+ sys_info['CPU(%)'] = psutil_proc.cpu_percent()
+ sys_info['Memory(%)'] = psutil_proc.memory_percent()
+ return sys_info
+
+ def _show_route_info(self, img, route_info):
+ canvas = np.full(img.shape, self.background_color, dtype=img.dtype)
+
+ x = self.x_offset
+ y = self.y_offset
+
+ max_len = 0
+
+ def _put_line(line=''):
+ nonlocal y, max_len
+ cv2.putText(canvas, line, (x, y), cv2.FONT_HERSHEY_DUPLEX,
+ self.text_scale, self.text_color, 1)
+ y += self.y_delta
+ max_len = max(max_len, len(line))
+
+ for node_info in route_info:
+ title = f'{node_info["node"]}({node_info["node_type"]})'
+ _put_line(title)
+ for k, v in node_info['info'].items():
+ if k in self.ignore_items:
+ continue
+ if isinstance(v, float):
+ v = f'{v:.1f}'
+ _put_line(f' {k}: {v}')
+
+ x1 = max(0, self.x_offset)
+ x2 = min(img.shape[1], int(x + max_len * self.text_scale * 20))
+ y1 = max(0, self.y_offset - self.y_delta)
+ y2 = min(img.shape[0], y)
+
+ src1 = canvas[y1:y2, x1:x2]
+ src2 = img[y1:y2, x1:x2]
+ img[y1:y2, x1:x2] = cv2.addWeighted(src1, 0.5, src2, 0.5, 0)
+
+ return img
+
+ def bypass(self, input_msgs):
+ return input_msgs['frame']
+
+
+@NODES.register_module()
+class RecorderNode(Node):
+ """Record the frames into a local file."""
+
+ def __init__(
+ self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ out_video_file: str,
+ out_video_fps: int = 30,
+ out_video_codec: str = 'mp4v',
+ buffer_size: int = 30,
+ ):
+ super().__init__(name=name, enable_key=None, enable=True)
+
+ self.queue = Queue(maxsize=buffer_size)
+ self.out_video_file = out_video_file
+ self.out_video_fps = out_video_fps
+ self.out_video_codec = out_video_codec
+ self.vwriter = None
+
+ # Register buffers
+ self.register_input_buffer(frame_buffer, 'frame', essential=True)
+ self.register_output_buffer(output_buffer)
+
+ # Start a new thread to write frame
+ self.t_record = Thread(target=self._record, args=(), daemon=True)
+ self.t_record.start()
+
+ def process(self, input_msgs):
+
+ frame_msg = input_msgs['frame']
+ img = frame_msg.get_image() if frame_msg is not None else None
+ img_queued = False
+
+ while not img_queued:
+ try:
+ self.queue.put(img, timeout=1)
+ img_queued = True
+ logging.info(f'{self.name}: recorder received one frame!')
+ except Full:
+                logging.info(f'{self.name}: recorder jammed!')
+
+ return frame_msg
+
+ def _record(self):
+
+ while True:
+
+ img = self.queue.get()
+
+ if img is None:
+ break
+
+ if self.vwriter is None:
+ fourcc = cv2.VideoWriter_fourcc(*self.out_video_codec)
+ fps = self.out_video_fps
+ frame_size = (img.shape[1], img.shape[0])
+ self.vwriter = cv2.VideoWriter(self.out_video_file, fourcc,
+ fps, frame_size)
+ assert self.vwriter.isOpened()
+
+ self.vwriter.write(img)
+
+        if self.vwriter is not None:
+            self.vwriter.release()
+        logging.info('Video recorder released!')
+
+ def on_exit(self):
+ try:
+            # Put a None into the frame queue so that _record() releases
+            # self.vwriter after all queued frames have been written to file.
+ self.queue.put(None, timeout=1)
+ self.t_record.join(timeout=1)
+ except Full:
+ pass
+
+ if self.t_record.is_alive():
+ # Force to release self.vwriter
+ logging.info('Video recorder forced release!')
+ if self.vwriter is not None:
+ self.vwriter.release()
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/mmdet_node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/mmdet_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..4207647c927dfbd34af225454ed5c2ef7466a012
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/mmdet_node.py
@@ -0,0 +1,84 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Union
+
+from .builder import NODES
+from .node import Node
+
+try:
+ from mmdet.apis import inference_detector, init_detector
+ has_mmdet = True
+except (ImportError, ModuleNotFoundError):
+ has_mmdet = False
+
+
+@NODES.register_module()
+class DetectorNode(Node):
+
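+    # An illustrative config entry (model files and buffer names below are
+    # placeholders; the real wiring depends on the webcam runner config):
+    #   dict(type='DetectorNode', name='detector',
+    #        model_config='path/to/mmdet_config.py',
+    #        model_checkpoint='path/to/checkpoint.pth',
+    #        input_buffer='input_frame', output_buffer='det_result')
+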
+ def __init__(self,
+ name: str,
+ model_config: str,
+ model_checkpoint: str,
+ input_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ device: str = 'cuda:0'):
+ # Check mmdetection is installed
+ assert has_mmdet, 'Please install mmdet to run the demo.'
+ super().__init__(name=name, enable_key=enable_key, enable=True)
+
+ self.model_config = model_config
+ self.model_checkpoint = model_checkpoint
+ self.device = device.lower()
+
+ # Init model
+ self.model = init_detector(
+ self.model_config,
+ self.model_checkpoint,
+ device=self.device.lower())
+
+ # Register buffers
+ self.register_input_buffer(input_buffer, 'input', essential=True)
+ self.register_output_buffer(output_buffer)
+
+ def bypass(self, input_msgs):
+ return input_msgs['input']
+
+ def process(self, input_msgs):
+ input_msg = input_msgs['input']
+
+ img = input_msg.get_image()
+
+ preds = inference_detector(self.model, img)
+ det_result = self._post_process(preds)
+
+ input_msg.add_detection_result(det_result, tag=self.name)
+ return input_msg
+
+ def _post_process(self, preds):
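+        # The returned dict groups per-instance predictions, roughly:
+        #   {'preds': [{'cls_id': 0, 'label': 'person',
+        #               'bbox': <(5,) array>, 'mask': <mask or None>}, ...],
+        #    'model_cfg': <copy of the detector config>}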
+ if isinstance(preds, tuple):
+ dets = preds[0]
+ segms = preds[1]
+ else:
+ dets = preds
+ segms = [None] * len(dets)
+
+ assert len(dets) == len(self.model.CLASSES)
+ assert len(segms) == len(self.model.CLASSES)
+ result = {'preds': [], 'model_cfg': self.model.cfg.copy()}
+
+ for i, (cls_name, bboxes,
+ masks) in enumerate(zip(self.model.CLASSES, dets, segms)):
+ if masks is None:
+ masks = [None] * len(bboxes)
+ else:
+ assert len(masks) == len(bboxes)
+
+ preds_i = [{
+ 'cls_id': i,
+ 'label': cls_name,
+ 'bbox': bbox,
+ 'mask': mask,
+ } for (bbox, mask) in zip(bboxes, masks)]
+ result['preds'].extend(preds_i)
+
+ return result
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/mmpose_node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/mmpose_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..167d7413ea48943b9373525bf5f392b5f1aa248b
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/mmpose_node.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import time
+from typing import Dict, List, Optional, Union
+
+from mmpose.apis import (get_track_id, inference_top_down_pose_model,
+ init_pose_model)
+from ..utils import Message
+from .builder import NODES
+from .node import Node
+
+
+@NODES.register_module()
+class TopDownPoseEstimatorNode(Node):
+
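+    # An illustrative config entry (model files and buffer names below are
+    # placeholders; the real wiring depends on the webcam runner config):
+    #   dict(type='TopDownPoseEstimatorNode', name='pose_estimator',
+    #        model_config='path/to/mmpose_config.py',
+    #        model_checkpoint='path/to/checkpoint.pth',
+    #        input_buffer='det_result', output_buffer='pose_result',
+    #        cls_names=['person'])
+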
+ def __init__(self,
+ name: str,
+ model_config: str,
+ model_checkpoint: str,
+ input_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ enable: bool = True,
+ device: str = 'cuda:0',
+ cls_ids: Optional[List] = None,
+ cls_names: Optional[List] = None,
+ bbox_thr: float = 0.5):
+ super().__init__(name=name, enable_key=enable_key, enable=enable)
+
+ # Init model
+ self.model_config = model_config
+ self.model_checkpoint = model_checkpoint
+ self.device = device.lower()
+
+ self.cls_ids = cls_ids
+ self.cls_names = cls_names
+ self.bbox_thr = bbox_thr
+
+ # Init model
+ self.model = init_pose_model(
+ self.model_config,
+ self.model_checkpoint,
+ device=self.device.lower())
+
+ # Store history for pose tracking
+ self.track_info = {
+ 'next_id': 0,
+ 'last_pose_preds': [],
+ 'last_time': None
+ }
+
+ # Register buffers
+ self.register_input_buffer(input_buffer, 'input', essential=True)
+ self.register_output_buffer(output_buffer)
+
+ def bypass(self, input_msgs):
+ return input_msgs['input']
+
+ def process(self, input_msgs: Dict[str, Message]) -> Message:
+
+ input_msg = input_msgs['input']
+ img = input_msg.get_image()
+ det_results = input_msg.get_detection_results()
+
+ if det_results is None:
+ raise ValueError(
+                'No detection results are found in the frame message. '
+ f'{self.__class__.__name__} should be used after a '
+ 'detector node.')
+
+ full_det_preds = []
+ for det_result in det_results:
+ det_preds = det_result['preds']
+ if self.cls_ids:
+ # Filter detection results by class ID
+ det_preds = [
+ p for p in det_preds if p['cls_id'] in self.cls_ids
+ ]
+ elif self.cls_names:
+ # Filter detection results by class name
+ det_preds = [
+ p for p in det_preds if p['label'] in self.cls_names
+ ]
+ full_det_preds.extend(det_preds)
+
+ # Inference pose
+ pose_preds, _ = inference_top_down_pose_model(
+ self.model,
+ img,
+ full_det_preds,
+ bbox_thr=self.bbox_thr,
+ format='xyxy')
+
+ # Pose tracking
+ current_time = time.time()
+ if self.track_info['last_time'] is None:
+ fps = None
+ elif self.track_info['last_time'] >= current_time:
+ fps = None
+ else:
+ fps = 1.0 / (current_time - self.track_info['last_time'])
+
+ pose_preds, next_id = get_track_id(
+ pose_preds,
+ self.track_info['last_pose_preds'],
+ self.track_info['next_id'],
+ use_oks=False,
+ tracking_thr=0.3,
+ use_one_euro=True,
+ fps=fps)
+
+ self.track_info['next_id'] = next_id
+ self.track_info['last_pose_preds'] = pose_preds.copy()
+ self.track_info['last_time'] = current_time
+
+ pose_result = {
+ 'preds': pose_preds,
+ 'model_cfg': self.model.cfg.copy(),
+ }
+
+ input_msg.add_pose_result(pose_result, tag=self.name)
+
+ return input_msg
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/node.py
new file mode 100644
index 0000000000000000000000000000000000000000..31e48d089dd18f8845125f50676cc175dbc2d24d
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/node.py
@@ -0,0 +1,372 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import time
+from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass
+from queue import Empty
+from threading import Thread
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+from mmcv.utils.misc import is_method_overridden
+
+from mmpose.utils import StopWatch
+from ..utils import Message, VideoEndingMessage, limit_max_fps
+
+
+@dataclass
+class BufferInfo():
+ """Dataclass for buffer information."""
+ buffer_name: str
+ input_name: Optional[str] = None
+ essential: bool = False
+
+
+@dataclass
+class EventInfo():
+ """Dataclass for event handler information."""
+ event_name: str
+ is_keyboard: bool = False
+ handler_func: Optional[Callable] = None
+
+
+class Node(Thread, metaclass=ABCMeta):
+ """Base interface of functional module.
+
+ Parameters:
+ name (str, optional): The node name (also thread name).
+ enable_key (str|int, optional): Set a hot-key to toggle enable/disable
+ of the node. If an int value is given, it will be treated as an
+ ascii code of a key. Please note:
+            1. If enable_key is set, the bypass method needs to be
+                overridden to define the node behavior when disabled
+            2. Some hot-keys are reserved for particular uses. For example:
+                'q', 'Q' and 27 are used for quitting
+            Default: None
+        max_fps (int): Maximum FPS of the node. This is to avoid the node
+            running unrestrictedly and consuming too many resources.
+            Default: 30
+        input_check_interval (float): Minimum interval (in seconds) between
+            checks of input readiness. Default: 0.01
+        enable (bool): Default enable/disable status. Default: True.
+        daemon (bool): Whether the node runs as a daemon thread.
+            Default: False.
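+
+    Example (a minimal sketch; nodes are normally described by config dicts
+    and built through the ``NODES`` registry by the webcam runner, so the
+    exact keys below are illustrative)::
+
+        dict(
+            type='MonitorNode',
+            name='monitor',
+            frame_buffer='vis',
+            output_buffer='display',
+            enable_key='m')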
+ """
+
+ def __init__(self,
+ name: Optional[str] = None,
+ enable_key: Optional[Union[str, int]] = None,
+ max_fps: int = 30,
+ input_check_interval: float = 0.01,
+ enable: bool = True,
+ daemon=False):
+ super().__init__(name=name, daemon=daemon)
+ self._runner = None
+ self._enabled = enable
+ self.enable_key = enable_key
+ self.max_fps = max_fps
+ self.input_check_interval = input_check_interval
+
+        # A partitioned view of the runner's buffer manager that only
+        # accesses the buffers related to this node
+ self._buffer_manager = None
+
+ # Input/output buffers are a list of registered buffers' information
+ self._input_buffers = []
+ self._output_buffers = []
+
+ # Event manager is a copy of assigned runner's event manager
+ self._event_manager = None
+
+ # A list of registered event information
+ # See register_event() for more information
+        # Note that we recommend handling events in nodes by registering
+        # handlers, but one can still access the raw events via _event_manager
+ self._registered_events = []
+
+ # A list of (listener_threads, event_info)
+ # See set_runner() for more information
+ self._event_listener_threads = []
+
+ # A timer to calculate node FPS
+ self._timer = StopWatch(window=10)
+
+ # Register enable toggle key
+ if self.enable_key:
+ # If the node allows toggling enable, it should override the
+ # `bypass` method to define the node behavior when disabled.
+ if not is_method_overridden('bypass', Node, self.__class__):
+ raise NotImplementedError(
+                    f'The node {self.__class__} does not support toggling '
+                    'enable but got argument `enable_key`. To support '
+                    'toggling enable, please override the `bypass` method '
+                    'of the node.')
+
+ self.register_event(
+ event_name=self.enable_key,
+ is_keyboard=True,
+ handler_func=self._toggle_enable,
+ )
+
+ @property
+ def registered_buffers(self):
+ return self._input_buffers + self._output_buffers
+
+ @property
+ def registered_events(self):
+ return self._registered_events.copy()
+
+ def _toggle_enable(self):
+ self._enabled = not self._enabled
+
+ def register_input_buffer(self,
+ buffer_name: str,
+ input_name: str,
+ essential: bool = False):
+ """Register an input buffer, so that Node can automatically check if
+ data is ready, fetch data from the buffers and format the inputs to
+ feed into `process` method.
+
+ This method can be invoked multiple times to register multiple input
+ buffers.
+
+ The subclass of Node should invoke `register_input_buffer` in its
+ `__init__` method.
+
+ Args:
+ buffer_name (str): The name of the buffer
+ input_name (str): The name of the fetched message from the
+ corresponding buffer
+ essential (bool): An essential input means the node will wait
+ until the input is ready before processing. Otherwise, an
+ inessential input will not block the processing, instead
+ a None will be fetched if the buffer is not ready.
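+
+        Example (illustrative sketch)::
+
+            # In a subclass's __init__:
+            self.register_input_buffer('det_result', 'detection',
+                                       essential=True)
+            self.register_input_buffer('frame', 'frame', essential=False)
+            # process() then receives
+            # input_msgs = {'detection': Message, 'frame': Message or None}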
+ """
+ buffer_info = BufferInfo(buffer_name, input_name, essential)
+ self._input_buffers.append(buffer_info)
+
+ def register_output_buffer(self, buffer_name: Union[str, List[str]]):
+ """Register one or multiple output buffers, so that the Node can
+ automatically send the output of the `process` method to these buffers.
+
+ The subclass of Node should invoke `register_output_buffer` in its
+ `__init__` method.
+
+ Args:
+ buffer_name (str|list): The name(s) of the output buffer(s).
+ """
+
+ if not isinstance(buffer_name, list):
+ buffer_name = [buffer_name]
+
+ for name in buffer_name:
+ buffer_info = BufferInfo(name)
+ self._output_buffers.append(buffer_info)
+
+ def register_event(self,
+ event_name: str,
+ is_keyboard: bool = False,
+ handler_func: Optional[Callable] = None):
+ """Register an event. All events used in the node need to be registered
+        in __init__(). If a callable handler is given, a thread will be
+        created to listen for and handle the event when the node starts.
+
+        Args:
+            event_name (str|int): The event name. If is_keyboard==True,
+                event_name should be a str (as char) or an int (as ascii)
+            is_keyboard (bool): Indicate whether it is a keyboard
+                event. If True, the argument event_name will be regarded as a
+                key indicator.
+            handler_func (callable, optional): The event handler function,
+                which should be a callable object with no arguments or
+ return values. Default: None.
+ """
+ event_info = EventInfo(event_name, is_keyboard, handler_func)
+ self._registered_events.append(event_info)
+
+ def set_runner(self, runner):
+ # Get partitioned buffer manager
+ buffer_names = [
+ buffer.buffer_name
+ for buffer in self._input_buffers + self._output_buffers
+ ]
+ self._buffer_manager = runner.buffer_manager.get_sub_manager(
+ buffer_names)
+
+ # Get event manager
+ self._event_manager = runner.event_manager
+
+ def _get_input_from_buffer(self) -> Tuple[bool, Optional[Dict]]:
+ """Get and pack input data if it's ready. The function returns a tuple
+ of a status flag and a packed data dictionary. If input_buffer is
+ ready, the status flag will be True, and the packed data is a dict
+        whose keys are input names and values are the corresponding messages
+        (unready unessential buffers will give a `None`). Otherwise, the
+        status flag is
+ False and the packed data is None.
+
+ Returns:
+ bool: status flag
+ dict[str, Message]: the packed inputs where the key is the buffer
+ name and the value is the Message got from the corresponding
+ buffer.
+ """
+ buffer_manager = self._buffer_manager
+
+ if buffer_manager is None:
+ raise ValueError(f'{self.name}: Runner not set!')
+
+ # Check that essential buffers are ready
+ for buffer_info in self._input_buffers:
+ if buffer_info.essential and buffer_manager.is_empty(
+ buffer_info.buffer_name):
+ return False, None
+
+ # Default input
+ result = {
+ buffer_info.input_name: None
+ for buffer_info in self._input_buffers
+ }
+
+ for buffer_info in self._input_buffers:
+ try:
+ result[buffer_info.input_name] = buffer_manager.get(
+ buffer_info.buffer_name, block=False)
+ except Empty:
+ if buffer_info.essential:
+ # Return unsuccessful flag if any
+ # essential input is unready
+ return False, None
+
+ return True, result
+
+ def _send_output_to_buffers(self, output_msg):
+ """Send output of the process method to registered output buffers.
+
+ Args:
+ output_msg (Message): output message
+ """
+ for buffer_info in self._output_buffers:
+ buffer_name = buffer_info.buffer_name
+ self._buffer_manager.put_force(buffer_name, output_msg)
+
+ @abstractmethod
+ def process(self, input_msgs: Dict[str, Message]) -> Union[Message, None]:
+ """The core method that implement the function of the node. This method
+ will be invoked when the node is enabled and the input data is ready.
+
+ All subclasses of Node should override this method.
+
+ Args:
+ input_msgs (dict): The input data collected from the buffers. For
+ each item, the key is the `input_name` of the registered input
+ buffer, while the value is a Message instance fetched from the
+ buffer (or None if the buffer is unessential and not ready).
+
+ Returns:
+            Message: The output message of the node. It will be sent to all
+ registered output buffers.
+ """
+
+ def bypass(self, input_msgs: Dict[str, Message]) -> Union[Message, None]:
+ """The method that defines the node behavior when disabled. Note that
+ if the node has an `enable_key`, this method should be override.
+
+ The method input/output is same as it of `process` method.
+
+ Args:
+ input_msgs (dict): The input data collected from the buffers. For
+ each item, the key is the `input_name` of the registered input
+ buffer, while the value is a Message instance fetched from the
+ buffer (or None if the buffer is unessential and not ready).
+
+ Returns:
+            Message: The output message of the node. It will be sent to all
+ registered output buffers.
+ """
+ raise NotImplementedError
+
+ def _get_node_info(self):
+ """Get route information of the node."""
+ info = {'fps': self._timer.report('_FPS_'), 'timestamp': time.time()}
+ return info
+
+ def on_exit(self):
+ """This method will be invoked on event `_exit_`.
+
+        Subclasses should override this method to specify the exit behavior.
+ """
+
+ def run(self):
+ """Method representing the Node's activity.
+
+        This method overrides the standard run() method of Thread. Users should
+ not override this method in subclasses.
+ """
+
+ logging.info(f'Node {self.name} starts')
+
+ # Create event listener threads
+ for event_info in self._registered_events:
+
+ if event_info.handler_func is None:
+ continue
+
+ def event_listener():
+ while True:
+ with self._event_manager.wait_and_handle(
+ event_info.event_name, event_info.is_keyboard):
+ event_info.handler_func()
+
+ t_listener = Thread(target=event_listener, args=(), daemon=True)
+ t_listener.start()
+ self._event_listener_threads.append(t_listener)
+
+ # Loop
+ while True:
+ # Exit
+ if self._event_manager.is_set('_exit_'):
+ self.on_exit()
+ break
+
+ # Check if input is ready
+ input_status, input_msgs = self._get_input_from_buffer()
+
+ # Input is not ready
+ if not input_status:
+ time.sleep(self.input_check_interval)
+ continue
+
+ # If a VideoEndingMessage is received, broadcast the signal
+ # without invoking process() or bypass()
+ video_ending = False
+ for _, msg in input_msgs.items():
+ if isinstance(msg, VideoEndingMessage):
+ self._send_output_to_buffers(msg)
+ video_ending = True
+ break
+
+ if video_ending:
+ self.on_exit()
+ break
+
+ # Check if enabled
+ if not self._enabled:
+ # Override bypass method to define node behavior when disabled
+ output_msg = self.bypass(input_msgs)
+ else:
+ with self._timer.timeit():
+ with limit_max_fps(self.max_fps):
+ # Process
+ output_msg = self.process(input_msgs)
+
+ if output_msg:
+ # Update route information
+ node_info = self._get_node_info()
+ output_msg.update_route_info(node=self, info=node_info)
+
+ # Send output message
+ if output_msg is not None:
+ self._send_output_to_buffers(output_msg)
+
+ logging.info(f'{self.name}: process ending.')
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/valentinemagic_node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/valentinemagic_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b1c6a585065416b50f1c889272d7e869942354e
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/valentinemagic_node.py
@@ -0,0 +1,340 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union
+
+import cv2
+import numpy as np
+
+from ..utils import (FrameMessage, get_eye_keypoint_ids, get_hand_keypoint_ids,
+ get_mouth_keypoint_ids, load_image_from_disk_or_url)
+from .builder import NODES
+from .frame_drawing_node import FrameDrawingNode
+
+
+@dataclass
+class HeartInfo():
+ """Dataclass for heart information."""
+ heart_type: int
+ start_time: float
+ start_pos: Tuple[int, int]
+ end_pos: Tuple[int, int]
+
+
+@NODES.register_module()
+class ValentineMagicNode(FrameDrawingNode):
+
+ def __init__(self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ enable_key: Optional[Union[str, int]] = None,
+ kpt_vis_thr: float = 0.3,
+ hand_heart_angle_thr: float = 90.0,
+ longest_duration: float = 2.0,
+ largest_ratio: float = 0.25,
+ hand_heart_img_path: Optional[str] = None,
+ flying_heart_img_path: Optional[str] = None,
+ hand_heart_dis_ratio_thr: float = 1.0,
+ flying_heart_dis_ratio_thr: float = 3.5,
+ num_persons: int = 2):
+
+ super().__init__(
+ name, frame_buffer, output_buffer, enable_key=enable_key)
+
+ if hand_heart_img_path is None:
+ hand_heart_img_path = 'https://user-images.githubusercontent.com/'\
+ '87690686/149731850-ea946766-a4e8-4efa-82f5'\
+ '-e2f0515db8ae.png'
+ if flying_heart_img_path is None:
+ flying_heart_img_path = 'https://user-images.githubusercontent.'\
+ 'com/87690686/153554948-937ce496-33dd-4'\
+ '9ab-9829-0433fd7c13c4.png'
+
+ self.hand_heart = load_image_from_disk_or_url(hand_heart_img_path)
+ self.flying_heart = load_image_from_disk_or_url(flying_heart_img_path)
+
+ self.kpt_vis_thr = kpt_vis_thr
+ self.hand_heart_angle_thr = hand_heart_angle_thr
+ self.hand_heart_dis_ratio_thr = hand_heart_dis_ratio_thr
+ self.flying_heart_dis_ratio_thr = flying_heart_dis_ratio_thr
+ self.longest_duration = longest_duration
+ self.largest_ratio = largest_ratio
+ self.num_persons = num_persons
+
+ # record the heart infos for each person
+ self.heart_infos = {}
+
+ def _cal_distance(self, p1: np.ndarray, p2: np.ndarray) -> np.float64:
+ """calculate the distance of points p1 and p2."""
+ return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)
+
+ def _cal_angle(self, p1: np.ndarray, p2: np.ndarray, p3: np.ndarray,
+ p4: np.ndarray) -> np.float64:
+ """calculate the angle of vectors v1(constructed by points p2 and p1)
+ and v2(constructed by points p4 and p3)"""
+ v1 = p2 - p1
+ v2 = p4 - p3
+
+ vector_prod = v1[0] * v2[0] + v1[1] * v2[1]
+ length_prod = np.sqrt(pow(v1[0], 2) + pow(v1[1], 2)) * np.sqrt(
+ pow(v2[0], 2) + pow(v2[1], 2))
+ cos = vector_prod * 1.0 / (length_prod * 1.0 + 1e-6)
+
+ return (np.arccos(cos) / np.pi) * 180
+
+ def _check_heart(self, pred: Dict[str,
+ np.ndarray], hand_indices: List[int],
+ mouth_index: int, eye_indices: List[int]) -> int:
+ """Check the type of Valentine Magic based on the pose results and
+        keypoint indices of hand, mouth, and eye.
+
+ Args:
+ pred(dict): The pose estimation results containing:
+ - "keypoints" (np.ndarray[K,3]): keypoint detection result
+ in [x, y, score]
+ hand_indices(list[int]): keypoint indices of hand
+ mouth_index(int): keypoint index of mouth
+ eye_indices(list[int]): keypoint indices of eyes
+
+ Returns:
+ int: a number representing the type of heart pose,
+ 0: None, 1: hand heart, 2: left hand blow kiss,
+ 3: right hand blow kiss
+ """
+ kpts = pred['keypoints']
+
+ left_eye_idx, right_eye_idx = eye_indices
+ left_eye_pos = kpts[left_eye_idx][:2]
+ right_eye_pos = kpts[right_eye_idx][:2]
+ eye_dis = self._cal_distance(left_eye_pos, right_eye_pos)
+
+        # these indices correspond to the following keypoints:
+ # left_hand_root, left_pinky_finger1,
+ # left_pinky_finger3, left_pinky_finger4,
+ # right_hand_root, right_pinky_finger1
+ # right_pinky_finger3, right_pinky_finger4
+
+ both_hands_vis = True
+ for i in [0, 17, 19, 20, 21, 38, 40, 41]:
+ if kpts[hand_indices[i]][2] < self.kpt_vis_thr:
+ both_hands_vis = False
+
+ if both_hands_vis:
+ p1 = kpts[hand_indices[20]][:2]
+ p2 = kpts[hand_indices[19]][:2]
+ p3 = kpts[hand_indices[17]][:2]
+ p4 = kpts[hand_indices[0]][:2]
+ left_angle = self._cal_angle(p1, p2, p3, p4)
+
+ p1 = kpts[hand_indices[41]][:2]
+ p2 = kpts[hand_indices[40]][:2]
+ p3 = kpts[hand_indices[38]][:2]
+ p4 = kpts[hand_indices[21]][:2]
+ right_angle = self._cal_angle(p1, p2, p3, p4)
+
+ hand_dis = self._cal_distance(kpts[hand_indices[20]][:2],
+ kpts[hand_indices[41]][:2])
+
+ if (left_angle < self.hand_heart_angle_thr
+ and right_angle < self.hand_heart_angle_thr
+ and hand_dis / eye_dis < self.hand_heart_dis_ratio_thr):
+ return 1
+
+        # these indices correspond to the following keypoints:
+        # left_middle_finger1, left_middle_finger4
+        left_hand_vis = True
+        for i in [9, 12]:
+            if kpts[hand_indices[i]][2] < self.kpt_vis_thr:
+                left_hand_vis = False
+                break
+
+        # these indices correspond to the following keypoints:
+        # right_middle_finger1, right_middle_finger4
+        right_hand_vis = True
+        for i in [30, 33]:
+            if kpts[hand_indices[i]][2] < self.kpt_vis_thr:
+                right_hand_vis = False
+                break
+
+ mouth_vis = True
+ if kpts[mouth_index][2] < self.kpt_vis_thr:
+ mouth_vis = False
+
+ if (not left_hand_vis and not right_hand_vis) or not mouth_vis:
+ return 0
+
+ mouth_pos = kpts[mouth_index]
+
+ left_mid_hand_pos = (kpts[hand_indices[9]][:2] +
+ kpts[hand_indices[12]][:2]) / 2
+ lefthand_mouth_dis = self._cal_distance(left_mid_hand_pos, mouth_pos)
+
+ if lefthand_mouth_dis / eye_dis < self.flying_heart_dis_ratio_thr:
+ return 2
+
+ right_mid_hand_pos = (kpts[hand_indices[30]][:2] +
+ kpts[hand_indices[33]][:2]) / 2
+ righthand_mouth_dis = self._cal_distance(right_mid_hand_pos, mouth_pos)
+
+ if righthand_mouth_dis / eye_dis < self.flying_heart_dis_ratio_thr:
+ return 3
+
+ return 0
+
+ def _get_heart_route(self, heart_type: int, cur_pred: Dict[str,
+ np.ndarray],
+ tar_pred: Dict[str,
+ np.ndarray], hand_indices: List[int],
+ mouth_index: int) -> Tuple[int, int]:
+ """get the start and end position of the heart, based on two keypoint
+ results and keypoint indices of hand and mouth.
+
+        Args:
+            heart_type(int): the type of heart pose (see ``_check_heart``)
+            cur_pred(dict): The pose estimation results of current person,
+                containing the following keys:
+                - "keypoints" (np.ndarray[K,3]): keypoint detection result
+                in [x, y, score]
+            tar_pred(dict): The pose estimation results of target person,
+                containing the following keys:
+ - "keypoints" (np.ndarray[K,3]): keypoint detection result
+ in [x, y, score]
+ hand_indices(list[int]): keypoint indices of hand
+ mouth_index(int): keypoint index of mouth
+
+ Returns:
+ tuple(int): the start position of heart
+ tuple(int): the end position of heart
+ """
+ cur_kpts = cur_pred['keypoints']
+
+ assert heart_type in [1, 2,
+ 3], 'Can not determine the type of heart effect'
+
+ if heart_type == 1:
+ p1 = cur_kpts[hand_indices[20]][:2]
+ p2 = cur_kpts[hand_indices[41]][:2]
+ elif heart_type == 2:
+ p1 = cur_kpts[hand_indices[9]][:2]
+ p2 = cur_kpts[hand_indices[12]][:2]
+ elif heart_type == 3:
+ p1 = cur_kpts[hand_indices[30]][:2]
+ p2 = cur_kpts[hand_indices[33]][:2]
+
+ cur_x, cur_y = (p1 + p2) / 2
+ # the mid point of two fingers
+ start_pos = (int(cur_x), int(cur_y))
+
+ tar_kpts = tar_pred['keypoints']
+ end_pos = tar_kpts[mouth_index][:2]
+
+ return start_pos, end_pos
+
+ def _draw_heart(self, canvas: np.ndarray, heart_info: HeartInfo,
+ t_pass: float) -> np.ndarray:
+ """draw the heart according to heart info and time."""
+ start_x, start_y = heart_info.start_pos
+ end_x, end_y = heart_info.end_pos
+
+ scale = t_pass / self.longest_duration
+
+ max_h, max_w = canvas.shape[:2]
+ hm, wm = self.largest_ratio * max_h, self.largest_ratio * max_h
+ new_h, new_w = int(hm * scale), int(wm * scale)
+
+ x = int(start_x + scale * (end_x - start_x))
+ y = int(start_y + scale * (end_y - start_y))
+
+ y1 = max(0, y - int(new_h / 2))
+ y2 = min(max_h - 1, y + int(new_h / 2))
+
+ x1 = max(0, x - int(new_w / 2))
+ x2 = min(max_w - 1, x + int(new_w / 2))
+
+ target = canvas[y1:y2 + 1, x1:x2 + 1].copy()
+ new_h, new_w = target.shape[:2]
+
+ if new_h == 0 or new_w == 0:
+ return canvas
+
+ assert heart_info.heart_type in [
+ 1, 2, 3
+ ], 'Can not determine the type of heart effect'
+ if heart_info.heart_type == 1: # hand heart
+ patch = self.hand_heart.copy()
+ elif heart_info.heart_type >= 2: # hand blow kiss
+ patch = self.flying_heart.copy()
+ if heart_info.start_pos[0] > heart_info.end_pos[0]:
+ patch = patch[:, ::-1]
+
+ patch = cv2.resize(patch, (new_w, new_h))
+ mask = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
+ mask = (mask < 100)[..., None].astype(np.float32) * 0.8
+
+ canvas[y1:y2 + 1, x1:x2 + 1] = patch * mask + target * (1 - mask)
+
+ return canvas
+
+ def draw(self, frame_msg: FrameMessage) -> np.ndarray:
+ canvas = frame_msg.get_image()
+
+ pose_results = frame_msg.get_pose_results()
+ if not pose_results:
+ return canvas
+
+ for pose_result in pose_results:
+ model_cfg = pose_result['model_cfg']
+
+ preds = [pred.copy() for pred in pose_result['preds']]
+ # if number of persons in the image is less than 2,
+ # no heart effect will be triggered
+ if len(preds) < self.num_persons:
+ continue
+
+ # if number of persons in the image is more than 2,
+ # only use the first two pose results
+ preds = preds[:self.num_persons]
+ ids = [preds[i]['track_id'] for i in range(self.num_persons)]
+
+ for id in self.heart_infos.copy():
+ if id not in ids:
+ # if a previously tracked person is no longer present,
+ # delete the corresponding heart info
+ del self.heart_infos[id]
+
+ for i in range(self.num_persons):
+ id = preds[i]['track_id']
+
+ # if the predicted person in previous heart_infos,
+ # draw the heart
+ if id in self.heart_infos.copy():
+ t_pass = time.time() - self.heart_infos[id].start_time
+
+ # if the time passed since the last heart pose is less
+ # than longest_duration, continue to draw the heart
+ if t_pass < self.longest_duration:
+ canvas = self._draw_heart(canvas, self.heart_infos[id],
+ t_pass)
+ # otherwise the effect has expired; remove its heart info
+ else:
+ del self.heart_infos[id]
+ else:
+ hand_indices = get_hand_keypoint_ids(model_cfg)
+ mouth_index = get_mouth_keypoint_ids(model_cfg)
+ eye_indices = get_eye_keypoint_ids(model_cfg)
+
+ # check the type of Valentine Magic based on pose results
+ # and keypoint indices of hand and mouth
+ heart_type = self._check_heart(preds[i], hand_indices,
+ mouth_index, eye_indices)
+ # trigger a Valentine Magic effect
+ if heart_type:
+ # get the route of heart
+ start_pos, end_pos = self._get_heart_route(
+ heart_type, preds[i],
+ preds[self.num_persons - 1 - i], hand_indices,
+ mouth_index)
+ start_time = time.time()
+ self.heart_infos[id] = HeartInfo(
+ heart_type, start_time, start_pos, end_pos)
+
+ return canvas
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/xdwendwen_node.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/xdwendwen_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a0914d3bf473f278023ed1569ae18d6d1b5fcf3
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/nodes/xdwendwen_node.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+from dataclasses import dataclass
+from typing import List, Tuple, Union
+
+import cv2
+import numpy as np
+
+from mmpose.datasets.dataset_info import DatasetInfo
+from ..utils import load_image_from_disk_or_url
+from .builder import NODES
+from .frame_drawing_node import FrameDrawingNode
+
+
+@dataclass
+class DynamicInfo:
+ pos_curr: Tuple[int, int] = (0, 0)
+ pos_step: Tuple[int, int] = (0, 0)
+ step_curr: int = 0
+
+
+@NODES.register_module()
+class XDwenDwenNode(FrameDrawingNode):
+ """An effect drawing node that captures the face of a cat or dog and blend
+ it into a Bing-Dwen-Dwen (the mascot of 2022 Beijing Winter Olympics).
+
+ Parameters:
+ name (str, optional): The node name (also thread name).
+ frame_buffer (str): The name of the input buffer.
+ output_buffer (str | list): The name(s) of the output buffer(s).
+ mode_key (str | int): A hot key to switch the background image.
+ resource_file (str): The annotation file of resource images, which
+ should be in Labelbee format and contain both facial keypoint and
+ region annotations.
+ out_shape (tuple): The shape of the output frame in (width, height).
+ rigid_transform (bool): Whether to warp the face with a rigid
+ (similarity) transform instead of a full affine transform.
+ Default: True.
+ """
+
+ dynamic_scale = 0.15
+ dynamic_max_step = 15
+
+ def __init__(
+ self,
+ name: str,
+ frame_buffer: str,
+ output_buffer: Union[str, List[str]],
+ mode_key: Union[str, int],
+ resource_file: str,
+ out_shape: Tuple[int, int] = (480, 480),
+ rigid_transform: bool = True,
+ ):
+ super().__init__(name, frame_buffer, output_buffer, enable=True)
+
+ self.mode_key = mode_key
+ self.mode_index = 0
+ self.out_shape = out_shape
+ self.rigid = rigid_transform
+
+ self.latest_pred = None
+
+ self.dynamic_info = DynamicInfo()
+
+ self.register_event(
+ self.mode_key, is_keyboard=True, handler_func=self.switch_mode)
+
+ self._init_resource(resource_file)
+
+ def _init_resource(self, resource_file):
+
+ # The resource_file is a JSON file that contains the facial
+ # keypoint and mask annotation information of the resource files.
+ # The annotations should follow the label-bee standard format.
+ # See https://github.com/open-mmlab/labelbee-client for details.
+ with open(resource_file) as f:
+ anns = json.load(f)
+ resource_infos = []
+
+ for ann in anns:
+ # Load image
+ img = load_image_from_disk_or_url(ann['url'])
+ # Load result
+ rst = json.loads(ann['result'])
+
+ # Check facial keypoint information
+ assert rst['step_1']['toolName'] == 'pointTool'
+ assert len(rst['step_1']['result']) == 3
+
+ keypoints = sorted(
+ rst['step_1']['result'], key=lambda x: x['order'])
+ keypoints = np.array([[pt['x'], pt['y']] for pt in keypoints])
+
+ # Check facial mask
+ assert rst['step_2']['toolName'] == 'polygonTool'
+ assert len(rst['step_2']['result']) == 1
+ assert len(rst['step_2']['result'][0]['pointList']) > 2
+
+ mask_pts = np.array(
+ [[pt['x'], pt['y']]
+ for pt in rst['step_2']['result'][0]['pointList']])
+
+ mul = 1.0 + self.dynamic_scale
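+ # Enlarge the resource image by dynamic_scale so that a margin is
+ # left around the output window for the random panning in draw().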
+
+ w_scale = self.out_shape[0] / img.shape[1] * mul
+ h_scale = self.out_shape[1] / img.shape[0] * mul
+
+ img = cv2.resize(
+ img,
+ dsize=None,
+ fx=w_scale,
+ fy=h_scale,
+ interpolation=cv2.INTER_CUBIC)
+
+ keypoints *= [w_scale, h_scale]
+ mask_pts *= [w_scale, h_scale]
+
+ mask = cv2.fillPoly(
+ np.zeros(img.shape[:2], dtype=np.uint8),
+ [mask_pts.astype(np.int32)],
+ color=1)
+
+ res = {
+ 'img': img,
+ 'keypoints': keypoints,
+ 'mask': mask,
+ }
+ resource_infos.append(res)
+
+ self.resource_infos = resource_infos
+
+ self._reset_dynamic()
+
+ def switch_mode(self):
+ self.mode_index = (self.mode_index + 1) % len(self.resource_infos)
+
+ def _reset_dynamic(self):
+ x_tar = np.random.randint(int(self.out_shape[0] * self.dynamic_scale))
+ y_tar = np.random.randint(int(self.out_shape[1] * self.dynamic_scale))
+
+ x_step = (x_tar -
+ self.dynamic_info.pos_curr[0]) / self.dynamic_max_step
+ y_step = (y_tar -
+ self.dynamic_info.pos_curr[1]) / self.dynamic_max_step
+
+ self.dynamic_info.pos_step = (x_step, y_step)
+ self.dynamic_info.step_curr = 0
+
+ def draw(self, frame_msg):
+
+ full_pose_results = frame_msg.get_pose_results()
+
+ pred = None
+ if full_pose_results:
+ for pose_results in full_pose_results:
+ if not pose_results['preds']:
+ continue
+
+ pred = pose_results['preds'][0].copy()
+ pred['dataset'] = DatasetInfo(pose_results['model_cfg'].data.
+ test.dataset_info).dataset_name
+
+ self.latest_pred = pred
+ break
+
+ # Use the latest pose result if there is none available in
+ # the current frame.
+ if pred is None:
+ pred = self.latest_pred
+
+ # Get the background image and facial annotations
+ res = self.resource_infos[self.mode_index]
+ img = frame_msg.get_image()
+ canvas = res['img'].copy()
+ mask = res['mask']
+ kpts_tar = res['keypoints']
+
+ if pred is not None:
+ if pred['dataset'] == 'ap10k':
+ # left eye: 0, right eye: 1, nose: 2
+ kpts_src = pred['keypoints'][[0, 1, 2], :2]
+ elif pred['dataset'] == 'coco_wholebody':
+ # left eye: 1, right eye 2, nose: 0
+ kpts_src = pred['keypoints'][[1, 2, 0], :2]
+ else:
+ raise ValueError('Cannot obtain face landmark information '
+ f'from dataset: {pred["dataset"]}')
+
+ trans_mat = self._get_transform(kpts_src, kpts_tar)
+
+ # cv2.warpAffine expects dsize in (width, height) order
+ warp = cv2.warpAffine(img, trans_mat, dsize=canvas.shape[1::-1])
+ cv2.copyTo(warp, mask, canvas)
+
+ # Add random movement to the background
+ xc, yc = self.dynamic_info.pos_curr
+ xs, ys = self.dynamic_info.pos_step
+ w, h = self.out_shape
+
+ x = min(max(int(xc), 0), canvas.shape[1] - w + 1)
+ y = min(max(int(yc), 0), canvas.shape[0] - h + 1)
+
+ canvas = canvas[y:y + h, x:x + w]
+
+ self.dynamic_info.pos_curr = (xc + xs, yc + ys)
+ self.dynamic_info.step_curr += 1
+
+ if self.dynamic_info.step_curr == self.dynamic_max_step:
+ self._reset_dynamic()
+
+ return canvas
+
+ def _get_transform(self, kpts_src, kpts_tar):
+ if self.rigid:
+ # rigid transform
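+ # Solve a least-squares similarity transform (rotation + uniform
+ # scale + translation): with parameters M = [a, b, tx, ty], a source
+ # point (x, y) maps to (a*x + b*y + tx, -b*x + a*y + ty). The stacked
+ # linear system X @ M = U is solved below via the pseudo-inverse.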
+ n = kpts_src.shape[0]
+ X = np.zeros((n * 2, 4), dtype=np.float32)
+ U = np.zeros((n * 2, 1), dtype=np.float32)
+ X[:n, :2] = kpts_src
+ X[:n, 2] = 1
+ X[n:, 0] = kpts_src[:, 1]
+ X[n:, 1] = -kpts_src[:, 0]
+ X[n:, 3] = 1
+
+ U[:n, 0] = kpts_tar[:, 0]
+ U[n:, 0] = kpts_tar[:, 1]
+
+ M = np.linalg.pinv(X).dot(U).flatten()
+
+ trans_mat = np.array([[M[0], M[1], M[2]], [-M[1], M[0], M[3]]],
+ dtype=np.float32)
+
+ else:
+ # normal affine transform
+ # adaptive horizontal flipping
+ if (np.linalg.norm(kpts_tar[0] - kpts_tar[2]) -
+ np.linalg.norm(kpts_tar[1] - kpts_tar[2])) * (
+ np.linalg.norm(kpts_src[0] - kpts_src[2]) -
+ np.linalg.norm(kpts_src[1] - kpts_src[2])) < 0:
+ kpts_src = kpts_src[[1, 0, 2], :]
+ trans_mat, _ = cv2.estimateAffine2D(
+ kpts_src.astype(np.float32), kpts_tar.astype(np.float32))
+
+ return trans_mat
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/__init__.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d906df0748cd6e5f87642ea6fdc9511e833e22ff
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .buffer import BufferManager
+from .event import EventManager
+from .message import FrameMessage, Message, VideoEndingMessage
+from .misc import (ImageCapture, copy_and_paste, expand_and_clamp,
+ get_cached_file_path, is_image_file, limit_max_fps,
+ load_image_from_disk_or_url, screen_matting)
+from .pose import (get_eye_keypoint_ids, get_face_keypoint_ids,
+ get_hand_keypoint_ids, get_mouth_keypoint_ids,
+ get_wrist_keypoint_ids)
+
+__all__ = [
+ 'BufferManager',
+ 'EventManager',
+ 'FrameMessage',
+ 'Message',
+ 'limit_max_fps',
+ 'VideoEndingMessage',
+ 'load_image_from_disk_or_url',
+ 'get_cached_file_path',
+ 'screen_matting',
+ 'expand_and_clamp',
+ 'copy_and_paste',
+ 'is_image_file',
+ 'ImageCapture',
+ 'get_eye_keypoint_ids',
+ 'get_face_keypoint_ids',
+ 'get_wrist_keypoint_ids',
+ 'get_mouth_keypoint_ids',
+ 'get_hand_keypoint_ids',
+]
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/buffer.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9fca4c392703bccb710a9659db21f56ea92e282
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/buffer.py
@@ -0,0 +1,106 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from functools import wraps
+from queue import Queue
+from typing import Dict, List, Optional
+
+from mmcv import is_seq_of
+
+__all__ = ['BufferManager']
+
+
+def check_buffer_registered(exist=True):
+
+ def wrapper(func):
+
+ @wraps(func)
+ def wrapped(manager, name, *args, **kwargs):
+ if exist:
+ # Assert buffer exist
+ if name not in manager:
+ raise ValueError(f'Fail to call {func.__name__}: '
+ f'buffer "{name}" is not registered.')
+ else:
+ # Assert buffer not exist
+ if name in manager:
+ raise ValueError(f'Fail to call {func.__name__}: '
+ f'buffer "{name}" is already registered.')
+ return func(manager, name, *args, **kwargs)
+
+ return wrapped
+
+ return wrapper
+
+
+class Buffer(Queue):
+
+ def put_force(self, item):
+ """Force to put an item into the buffer.
+
+ If the buffer is already full, the earliest item in the buffer will be
+ removed to make room for the incoming item.
+ """
+ with self.mutex:
+ if self.maxsize > 0:
+ while self._qsize() >= self.maxsize:
+ _ = self._get()
+ self.unfinished_tasks -= 1
+
+ self._put(item)
+ self.unfinished_tasks += 1
+ self.not_empty.notify()
+
+
+class BufferManager():
+
+ def __init__(self,
+ buffer_type: type = Buffer,
+ buffers: Optional[Dict] = None):
+ self.buffer_type = buffer_type
+ if buffers is None:
+ self._buffers = {}
+ else:
+ if is_seq_of(list(buffers.values()), buffer_type):
+ self._buffers = buffers.copy()
+ else:
+ raise ValueError('The values of buffers should be instance '
+ f'of {buffer_type}')
+
+ def __contains__(self, name):
+ return name in self._buffers
+
+ @check_buffer_registered(False)
+ def register_buffer(self, name, maxsize=0):
+ self._buffers[name] = self.buffer_type(maxsize)
+
+ @check_buffer_registered()
+ def put(self, name, item, block=True, timeout=None):
+ self._buffers[name].put(item, block, timeout)
+
+ @check_buffer_registered()
+ def put_force(self, name, item):
+ self._buffers[name].put_force(item)
+
+ @check_buffer_registered()
+ def get(self, name, block=True, timeout=None):
+ return self._buffers[name].get(block, timeout)
+
+ @check_buffer_registered()
+ def is_empty(self, name):
+ return self._buffers[name].empty()
+
+ @check_buffer_registered()
+ def is_full(self, name):
+ return self._buffers[name].full()
+
+ def get_sub_manager(self, buffer_names: List[str]):
+ buffers = {name: self._buffers[name] for name in buffer_names}
+ return BufferManager(self.buffer_type, buffers)
+
+ def get_info(self):
+ buffer_info = {}
+ for name, buffer in self._buffers.items():
+ buffer_info[name] = {
+ 'size': buffer.qsize(),
+ 'maxsize': buffer.maxsize
+ }
+ return buffer_info
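+
+
+# A minimal usage sketch (illustrative only; the buffer name and the
+# `frame_msg` variable are hypothetical):
+#
+#   manager = BufferManager()
+#   manager.register_buffer('_frame_', maxsize=1)
+#   manager.put_force('_frame_', frame_msg)  # drop the stale frame if full
+#   latest = manager.get('_frame_')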
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/event.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/event.py
new file mode 100644
index 0000000000000000000000000000000000000000..ceab26f72b63d03bc574cda3a713fed67f20f0c0
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/event.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import defaultdict
+from contextlib import contextmanager
+from threading import Event
+from typing import Optional
+
+
+class EventManager():
+
+ def __init__(self):
+ self._events = defaultdict(Event)
+
+ def register_event(self,
+ event_name: str = None,
+ is_keyboard: bool = False):
+ if is_keyboard:
+ event_name = self._get_keyboard_event_name(event_name)
+ self._events[event_name] = Event()
+
+ def set(self, event_name: str = None, is_keyboard: bool = False):
+ if is_keyboard:
+ event_name = self._get_keyboard_event_name(event_name)
+ return self._events[event_name].set()
+
+ def wait(self,
+ event_name: str = None,
+ is_keyboard: Optional[bool] = False,
+ timeout: Optional[float] = None):
+ if is_keyboard:
+ event_name = self._get_keyboard_event_name(event_name)
+ return self._events[event_name].wait(timeout)
+
+ def is_set(self,
+ event_name: str = None,
+ is_keyboard: Optional[bool] = False):
+ if is_keyboard:
+ event_name = self._get_keyboard_event_name(event_name)
+ return self._events[event_name].is_set()
+
+ def clear(self,
+ event_name: str = None,
+ is_keyboard: Optional[bool] = False):
+ if is_keyboard:
+ event_name = self._get_keyboard_event_name(event_name)
+ return self._events[event_name].clear()
+
+ @staticmethod
+ def _get_keyboard_event_name(key):
+ return f'_keyboard_{chr(key) if isinstance(key, int) else key}'
+
+ @contextmanager
+ def wait_and_handle(self,
+ event_name: str = None,
+ is_keyboard: Optional[bool] = False):
+ self.wait(event_name, is_keyboard)
+ try:
+ yield
+ finally:
+ self.clear(event_name, is_keyboard)
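+
+
+# Illustrative usage (a sketch based on the methods above; the key and the
+# handler call are hypothetical):
+#
+#   events = EventManager()
+#   events.register_event('q', is_keyboard=True)
+#   ...
+#   with events.wait_and_handle('q', is_keyboard=True):
+#       toggle_node_state()  # runs once per key press; cleared on exit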
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/message.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/message.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7b1529c5ece3970dfae189d910720786f32612d
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/message.py
@@ -0,0 +1,204 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import time
+import uuid
+import warnings
+from typing import Dict, List, Optional
+
+import numpy as np
+
+
+class Message():
+ """Message base class.
+
+ All message classes should inherit this class. The basic use of a Message
+ instance is to carry a piece of text message (self.msg) and a dict that
+ stores structured data (self.data), e.g. frame image, model prediction,
+ etc.
+
+ A message may also hold route information, which is composed of
+ information of all nodes the message has passed through.
+
+ Parameters:
+ msg (str): The text message.
+ data (dict, optional): The structured data.
+ """
+
+ def __init__(self, msg: str = '', data: Optional[Dict] = None):
+ self.msg = msg
+ self.data = data if data else {}
+ self.route_info = []
+ self.timestamp = time.time()
+ self.id = uuid.uuid4()
+
+ def update_route_info(self,
+ node=None,
+ node_name: Optional[str] = None,
+ node_type: Optional[str] = None,
+ info: Optional[Dict] = None):
+ """Append new node information to the route information.
+
+ Args:
+ node (Node, optional): An instance of Node that provides basic
+ information like the node name and type. Default: None.
+ node_name (str, optional): The node name. If node is given,
+ node_name will be ignored. Default: None.
+ node_type (str, optional): The class name of the node. If node
+ is given, node_type will be ignored. Default: None.
+ info (dict, optional): The node information, which is usually
+ given by node.get_node_info(). Default: None.
+ """
+ if node is not None:
+ if node_name is not None or node_type is not None:
+ warnings.warn(
+ '`node_name` and `node_type` will be overridden if node'
+ 'is provided.')
+ node_name = node.name
+ node_type = node.__class__.__name__
+
+ node_info = {'node': node_name, 'node_type': node_type, 'info': info}
+ self.route_info.append(node_info)
+
+ def set_route_info(self, route_info: List):
+ """Directly set the entire route information.
+
+ Args:
+ route_info (list): route information to set to the message.
+ """
+ self.route_info = route_info
+
+ def merge_route_info(self, route_info: List):
+ """Merge the given route information into the original one of the
+ message. This is used for combining route information from multiple
+ messages. The node information in the route will be reordered according
+ to their timestamps.
+
+ Args:
+ route_info (list): route information to merge.
+ """
+ self.route_info += route_info
+ self.route_info.sort(key=lambda x: x.get('timestamp', np.inf))
+
+ def get_route_info(self) -> List:
+ return self.route_info.copy()
+
+
+class VideoEndingMessage(Message):
+ """A special message to indicate the input video is ending."""
+
+
+class FrameMessage(Message):
+ """The message to store information of a video frame.
+
+ A FrameMessage instance usually holds following data in self.data:
+ - image (array): The frame image
+ - detection_results (list): A list to hold detection results of
+ multiple detectors. Each element is a tuple (tag, result)
+ - pose_results (list): A list to hold pose estimation results of
+ multiple pose estimators. Each element is a tuple (tag, result)
+ """
+
+ def __init__(self, img):
+ super().__init__(data=dict(image=img))
+
+ def get_image(self):
+ """Get the frame image.
+
+ Returns:
+ array: The frame image.
+ """
+ return self.data.get('image', None)
+
+ def set_image(self, img):
+ """Set the frame image to the message."""
+ self.data['image'] = img
+
+ def add_detection_result(self, result, tag: str = None):
+ """Add the detection result from one model into the message's
+ detection_results.
+
+ Args:
+ result: The detection result to add.
+ tag (str, optional): Give a tag to the result, which can be used
+ to retrieve specific results.
+ """
+ if 'detection_results' not in self.data:
+ self.data['detection_results'] = []
+ self.data['detection_results'].append((tag, result))
+
+ def get_detection_results(self, tag: str = None):
+ """Get detection results of the message.
+
+ Args:
+ tag (str, optional): If given, only the results with the tag
+ will be retrieved. Otherwise all results will be retrieved.
+ Default: None.
+
+ Returns:
+ list[dict]: The retrieved detection results
+ """
+ if 'detection_results' not in self.data:
+ return None
+ if tag is None:
+ results = [res for _, res in self.data['detection_results']]
+ else:
+ results = [
+ res for _tag, res in self.data['detection_results']
+ if _tag == tag
+ ]
+ return results
+
+ def add_pose_result(self, result, tag=None):
+ """Add the pose estimation result from one model into the message's
+ pose_results.
+
+ Args:
+ result: The pose estimation result to add.
+ tag (str, optional): Give a tag to the result, which can be used
+ to retrieve specific results.
+ """
+ if 'pose_results' not in self.data:
+ self.data['pose_results'] = []
+ self.data['pose_results'].append((tag, result))
+
+ def get_pose_results(self, tag=None):
+ """Get pose estimation results of the message.
+
+ Args:
+ tag (str, optional): If given, only the results with the tag
+ will be retrieved. Otherwise all results will be retrieved.
+ Default: None.
+
+ Returns:
+ list[dict]: The retrieved pose results
+ """
+ if 'pose_results' not in self.data:
+ return None
+ if tag is None:
+ results = [res for _, res in self.data['pose_results']]
+ else:
+ results = [
+ res for _tag, res in self.data['pose_results'] if _tag == tag
+ ]
+ return results
+
+ def get_full_results(self):
+ """Get all model predictions of the message.
+
+ See set_full_results() for reference.
+
+ Returns:
+ dict: All model predictions, including:
+ - detection_results
+ - pose_results
+ """
+ result_keys = ['detection_results', 'pose_results']
+ results = {k: self.data[k] for k in result_keys}
+ return results
+
+ def set_full_results(self, results):
+ """Set full model results directly.
+
+ Args:
+ results (dict): All model predictions including:
+ - detection_results (list): see also add_detection_results()
+ - pose_results (list): see also add_pose_results()
+ """
+ self.data.update(results)
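+
+
+# Typical flow (an illustrative sketch; the tags and the `det_result` /
+# `pose_result` variables are hypothetical):
+#
+#   msg = FrameMessage(frame)
+#   msg.add_detection_result(det_result, tag='human_det')
+#   msg.add_pose_result(pose_result, tag='human_pose')
+#   pose_results = msg.get_pose_results(tag='human_pose')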
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/misc.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..c64f4179db8a3618b38e3d6933992e9b3294af55
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/misc.py
@@ -0,0 +1,343 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import sys
+import time
+from contextlib import contextmanager
+from typing import Optional
+from urllib.parse import urlparse
+from urllib.request import urlopen
+
+import cv2
+import numpy as np
+from torch.hub import HASH_REGEX, download_url_to_file
+
+
+@contextmanager
+def limit_max_fps(fps: Optional[float]):
+ t_start = time.time()
+ try:
+ yield
+ finally:
+ t_end = time.time()
+ if fps is not None:
+ t_sleep = 1.0 / fps - t_end + t_start
+ if t_sleep > 0:
+ time.sleep(t_sleep)
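+
+
+# Illustrative use (a sketch; the loop body is hypothetical): cap a
+# processing loop at roughly 30 FPS.
+#
+#   while running:
+#       with limit_max_fps(30):
+#           process_one_frame()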
+
+
+def _is_url(filename):
+ """Check if the file is a url link.
+
+ Args:
+ filename (str): the file name or url link.
+
+ Returns:
+ bool: is url or not.
+ """
+ prefixes = ['http://', 'https://']
+ for p in prefixes:
+ if filename.startswith(p):
+ return True
+ return False
+
+
+def load_image_from_disk_or_url(filename, readFlag=cv2.IMREAD_COLOR):
+ """Load an image file, from disk or url.
+
+ Args:
+ filename (str): file name on the disk or url link.
+ readFlag (int): readFlag for imdecode.
+
+ Returns:
+ np.ndarray: A loaded image
+ """
+ if _is_url(filename):
+ # download the image, convert it to a NumPy array, and then read
+ # it into OpenCV format
+ resp = urlopen(filename)
+ image = np.asarray(bytearray(resp.read()), dtype='uint8')
+ image = cv2.imdecode(image, readFlag)
+ return image
+ else:
+ image = cv2.imread(filename, readFlag)
+ return image
+
+
+def mkdir_or_exist(dir_name, mode=0o777):
+ if dir_name == '':
+ return
+ dir_name = osp.expanduser(dir_name)
+ os.makedirs(dir_name, mode=mode, exist_ok=True)
+
+
+def get_cached_file_path(url,
+ save_dir=None,
+ progress=True,
+ check_hash=False,
+ file_name=None):
+ r"""Loads the Torch serialized object at the given URL.
+
+ If downloaded file is a zip file, it will be automatically decompressed
+
+ If the object is already present in `model_dir`, it's deserialized and
+ returned.
+ The default value of ``model_dir`` is ``/checkpoints`` where
+ ``hub_dir`` is the directory returned by :func:`~torch.hub.get_dir`.
+
+ Args:
+ url (str): URL of the object to download
+ save_dir (str, optional): directory in which to save the object
+ progress (bool, optional): whether or not to display a progress bar
+ to stderr. Default: True
+ check_hash(bool, optional): If True, the filename part of the URL
+ should follow the naming convention ``filename-<sha256>.ext``
+ where ``<sha256>`` is the first eight or more digits of the
+ SHA256 hash of the contents of the file. The hash is used to
+ ensure unique names and to verify the contents of the file.
+ Default: False
+ file_name (str, optional): name for the downloaded file. Filename
+ from ``url`` will be used if not set. Default: None.
+ """
+ if save_dir is None:
+ save_dir = os.path.join('webcam_resources')
+
+ mkdir_or_exist(save_dir)
+
+ parts = urlparse(url)
+ filename = os.path.basename(parts.path)
+ if file_name is not None:
+ filename = file_name
+ cached_file = os.path.join(save_dir, filename)
+ if not os.path.exists(cached_file):
+ sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
+ hash_prefix = None
+ if check_hash:
+ r = HASH_REGEX.search(filename) # r is Optional[Match[str]]
+ hash_prefix = r.group(1) if r else None
+ download_url_to_file(url, cached_file, hash_prefix, progress=progress)
+ return cached_file
+
+
+def screen_matting(img, color_low=None, color_high=None, color=None):
+ """Screen Matting.
+
+ Args:
+ img (np.ndarray): Image data.
+ color_low (tuple): Lower limit (b, g, r).
+ color_high (tuple): Higher limit (b, g, r).
+ color (str): Support colors include:
+
+ - 'green' or 'g'
+ - 'blue' or 'b'
+ - 'black' or 'k'
+ - 'white' or 'w'
+ """
+
+ if color_high is None or color_low is None:
+ if color is not None:
+ if color.lower() == 'g' or color.lower() == 'green':
+ color_low = (0, 200, 0)
+ color_high = (60, 255, 60)
+ elif color.lower() == 'b' or color.lower() == 'blue':
+ color_low = (230, 0, 0)
+ color_high = (255, 40, 40)
+ elif color.lower() == 'k' or color.lower() == 'black':
+ color_low = (0, 0, 0)
+ color_high = (40, 40, 40)
+ elif color.lower() == 'w' or color.lower() == 'white':
+ color_low = (230, 230, 230)
+ color_high = (255, 255, 255)
+ else:
+ raise NotImplementedError(f'Not supported color: {color}.')
+ else:
+ raise ValueError(
+ 'color or color_high | color_low should be given.')
+
+ mask = cv2.inRange(img, np.array(color_low), np.array(color_high)) == 0
+
+ return mask.astype(np.uint8)
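+
+
+# Example (an illustrative sketch): mask out a green screen and keep the
+# foreground (the returned mask is 1 for non-green pixels, 0 otherwise).
+#
+#   mask = screen_matting(frame, color='green')
+#   foreground = frame * mask[..., None]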
+
+
+def expand_and_clamp(box, im_shape, s=1.25):
+ """Expand the bbox and clip it to fit the image shape.
+
+ Args:
+ box (list): x1, y1, x2, y2
+ im_shape (ndarray): image shape (h, w, c)
+ s (float): expand ratio
+
+ Returns:
+ list: x1, y1, x2, y2
+ """
+
+ x1, y1, x2, y2 = box[:4]
+ w = x2 - x1
+ h = y2 - y1
+ delta_w = w * (s - 1) / 2
+ delta_h = h * (s - 1) / 2
+
+ x1, y1, x2, y2 = x1 - delta_w, y1 - delta_h, x2 + delta_w, y2 + delta_h
+
+ img_h, img_w = im_shape[:2]
+
+ x1 = min(max(0, int(x1)), img_w - 1)
+ y1 = min(max(0, int(y1)), img_h - 1)
+ x2 = min(max(0, int(x2)), img_w - 1)
+ y2 = min(max(0, int(y2)), img_h - 1)
+
+ return [x1, y1, x2, y2]
+
+
+def _find_connected_components(mask):
+ """Find connected components and sort with areas.
+
+ Args:
+ mask (ndarray): instance segmentation result.
+
+ Returns:
+ ndarray (N, 5): Each item contains (x, y, w, h, area).
+ """
+ num, labels, stats, centroids = cv2.connectedComponentsWithStats(mask)
+ stats = stats[stats[:, 4].argsort()]
+ return stats
+
+
+def _find_bbox(mask):
+ """Find the bounding box for the mask.
+
+ Args:
+ mask (ndarray): Mask.
+
+ Returns:
+ list(4, ): Returned box (x1, y1, x2, y2).
+ """
+ mask_shape = mask.shape
+ if len(mask_shape) == 3:
+ assert mask_shape[-1] == 1, 'the channel of the mask should be 1.'
+ elif len(mask_shape) == 2:
+ pass
+ else:
+ raise NotImplementedError('mask should be a 2D or single-channel 3D array.')
+
+ h, w = mask_shape[:2]
+ mask_w = mask.sum(0)
+ mask_h = mask.sum(1)
+
+ left = 0
+ right = w - 1
+ up = 0
+ down = h - 1
+
+ for i in range(w):
+ if mask_w[i] > 0:
+ break
+ left += 1
+
+ for i in range(w - 1, left, -1):
+ if mask_w[i] > 0:
+ break
+ right -= 1
+
+ for i in range(h):
+ if mask_h[i] > 0:
+ break
+ up += 1
+
+ for i in range(h - 1, up, -1):
+ if mask_h[i] > 0:
+ break
+ down -= 1
+
+ return [left, up, right, down]
+
+
+def copy_and_paste(img,
+ background_img,
+ mask,
+ bbox=None,
+ effect_region=(0.2, 0.2, 0.8, 0.8),
+ min_size=(20, 20)):
+ """Copy the image region and paste to the background.
+
+ Args:
+ img (np.ndarray): Image data.
+ background_img (np.ndarray): Background image data.
+ mask (ndarray): instance segmentation result.
+ bbox (ndarray): instance bbox, (x1, y1, x2, y2).
+ effect_region (tuple(4, )): The region to apply mask, the coordinates
+ are normalized (x1, y1, x2, y2).
+ """
+ background_img = background_img.copy()
+ background_h, background_w = background_img.shape[:2]
+ region_h = (effect_region[3] - effect_region[1]) * background_h
+ region_w = (effect_region[2] - effect_region[0]) * background_w
+ region_aspect_ratio = region_w / region_h
+
+ if bbox is None:
+ bbox = _find_bbox(mask)
+ instance_w = bbox[2] - bbox[0]
+ instance_h = bbox[3] - bbox[1]
+
+ if instance_w > min_size[0] and instance_h > min_size[1]:
+ aspect_ratio = instance_w / instance_h
+ if region_aspect_ratio > aspect_ratio:
+ resize_rate = region_h / instance_h
+ else:
+ resize_rate = region_w / instance_w
+
+ mask_inst = mask[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])]
+ img_inst = img[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])]
+ img_inst = cv2.resize(img_inst, (int(
+ resize_rate * instance_w), int(resize_rate * instance_h)))
+ mask_inst = cv2.resize(
+ mask_inst,
+ (int(resize_rate * instance_w), int(resize_rate * instance_h)),
+ interpolation=cv2.INTER_NEAREST)
+
+ mask_ids = list(np.where(mask_inst == 1))
+ mask_ids[1] += int(effect_region[0] * background_w)
+ mask_ids[0] += int(effect_region[1] * background_h)
+
+ background_img[tuple(mask_ids)] = img_inst[np.where(mask_inst == 1)]
+
+ return background_img
+
+
+def is_image_file(path):
+ if isinstance(path, str):
+ if path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
+ return True
+ return False
+
+
+class ImageCapture:
+ """A mock-up version of cv2.VideoCapture that always return a const image.
+
+ Args:
+ image (str | ndarray): The image or image path
+ """
+
+ def __init__(self, image):
+ if isinstance(image, str):
+ self.image = load_image_from_disk_or_url(image)
+ else:
+ self.image = image
+
+ def isOpened(self):
+ return (self.image is not None)
+
+ def read(self):
+ return True, self.image.copy()
+
+ def release(self):
+ pass
+
+ def get(self, propId):
+ if propId == cv2.CAP_PROP_FRAME_WIDTH:
+ return self.image.shape[1]
+ elif propId == cv2.CAP_PROP_FRAME_HEIGHT:
+ return self.image.shape[0]
+ elif propId == cv2.CAP_PROP_FPS:
+ return np.nan
+ else:
+ raise NotImplementedError()
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/pose.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/pose.py
new file mode 100644
index 0000000000000000000000000000000000000000..196b40ef53d78173742d4d6f953176cf76238308
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/utils/pose.py
@@ -0,0 +1,226 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+from mmcv import Config
+
+from mmpose.datasets.dataset_info import DatasetInfo
+
+
+def get_eye_keypoint_ids(model_cfg: Config) -> Tuple[int, int]:
+ """A helpfer function to get the keypoint indices of left and right eyes
+ from the model config.
+
+ Args:
+ model_cfg (Config): pose model config.
+
+ Returns:
+ int: left eye keypoint index.
+ int: right eye keypoint index.
+ """
+ left_eye_idx = None
+ right_eye_idx = None
+
+ # try obtaining eye point ids from dataset_info
+ try:
+ dataset_info = DatasetInfo(model_cfg.data.test.dataset_info)
+ left_eye_idx = dataset_info.keypoint_name2id.get('left_eye', None)
+ right_eye_idx = dataset_info.keypoint_name2id.get('right_eye', None)
+ except AttributeError:
+ left_eye_idx = None
+ right_eye_idx = None
+
+ if left_eye_idx is None or right_eye_idx is None:
+ # Fall back to hard coded keypoint id
+ dataset_name = model_cfg.data.test.type
+ if dataset_name in {
+ 'TopDownCocoDataset', 'TopDownCocoWholeBodyDataset'
+ }:
+ left_eye_idx = 1
+ right_eye_idx = 2
+ elif dataset_name in {'AnimalPoseDataset', 'AnimalAP10KDataset'}:
+ left_eye_idx = 0
+ right_eye_idx = 1
+ else:
+ raise ValueError('Can not determine the eye keypoint id of '
+ f'{dataset_name}')
+
+ return left_eye_idx, right_eye_idx
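+
+
+# Illustrative call (a sketch; `pose_model` and `kpts` are hypothetical
+# names for a loaded top-down model and its keypoint predictions):
+#
+#   left_eye, right_eye = get_eye_keypoint_ids(pose_model.cfg)
+#   eye_dis = np.linalg.norm(kpts[left_eye, :2] - kpts[right_eye, :2])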
+
+
+def get_face_keypoint_ids(model_cfg: Config) -> List[int]:
+ """A helper function to get the keypoint indices of the face from the
+ model config.
+
+ Args:
+ model_cfg (Config): pose model config.
+
+ Returns:
+ list[int]: face keypoint indices.
+ """
+ face_indices = None
+
+ # try obtaining face keypoint ids from dataset_info
+ try:
+ dataset_info = DatasetInfo(model_cfg.data.test.dataset_info)
+ face_indices = []
+ for id in range(68):
+ face_indices.append(
+ dataset_info.keypoint_name2id.get(f'face_{id}', None))
+ except AttributeError:
+ face_indices = None
+
+ if face_indices is None:
+ # Fall back to hard coded keypoint id
+ dataset_name = model_cfg.data.test.type
+ if dataset_name in {'TopDownCocoWholeBodyDataset'}:
+ face_indices = list(range(23, 91))
+ else:
+ raise ValueError('Can not determine the face id of '
+ f'{dataset_name}')
+
+ return face_indices
+
+
+def get_wrist_keypoint_ids(model_cfg: Config) -> Tuple[int, int]:
+ """A helpfer function to get the keypoint indices of left and right wrist
+ from the model config.
+
+ Args:
+ model_cfg (Config): pose model config.
+ Returns:
+ int: left wrist keypoint index.
+ int: right wrist keypoint index.
+ """
+
+ # try obtaining wrist keypoint ids from dataset_info
+ try:
+ dataset_info = DatasetInfo(model_cfg.data.test.dataset_info)
+ left_wrist_idx = dataset_info.keypoint_name2id.get('left_wrist', None)
+ right_wrist_idx = dataset_info.keypoint_name2id.get(
+ 'right_wrist', None)
+ except AttributeError:
+ left_wrist_idx = None
+ right_wrist_idx = None
+
+ if left_wrist_idx is None or right_wrist_idx is None:
+ # Fall back to hard coded keypoint id
+ dataset_name = model_cfg.data.test.type
+ if dataset_name in {
+ 'TopDownCocoDataset', 'TopDownCocoWholeBodyDataset'
+ }:
+ left_wrist_idx = 9
+ right_wrist_idx = 10
+ elif dataset_name == 'AnimalPoseDataset':
+ left_wrist_idx = 16
+ right_wrist_idx = 17
+ elif dataset_name == 'AnimalAP10KDataset':
+ left_wrist_idx = 7
+ right_wrist_idx = 10
+ else:
+ raise ValueError('Cannot determine the wrist keypoint id of '
+ f'{dataset_name}')
+
+ return left_wrist_idx, right_wrist_idx
+
+
+def get_mouth_keypoint_ids(model_cfg: Config) -> int:
+ """A helper function to get the keypoint index of the mouth from the
+ model config.
+
+ Args:
+ model_cfg (Config): pose model config.
+ Returns:
+ int: mouth keypoint index.
+ """
+ # try obtaining mouth point ids from dataset_info
+ try:
+ dataset_info = DatasetInfo(model_cfg.data.test.dataset_info)
+ mouth_index = dataset_info.keypoint_name2id.get('face-62', None)
+ except AttributeError:
+ mouth_index = None
+
+ if mouth_index is None:
+ # Fall back to hard coded keypoint id
+ dataset_name = model_cfg.data.test.type
+ if dataset_name == 'TopDownCocoWholeBodyDataset':
+ mouth_index = 85
+ else:
+ raise ValueError('Cannot determine the mouth keypoint id of '
+ f'{dataset_name}')
+
+ return mouth_index
+
+
+def get_hand_keypoint_ids(model_cfg: Config) -> List[int]:
+ """A helpfer function to get the keypoint indices of left and right hand
+ from the model config.
+
+ Args:
+ model_cfg (Config): pose model config.
+ Returns:
+ list[int]: hand keypoint indices.
+ """
+ # try obtaining hand keypoint ids from dataset_info
+ try:
+ hand_indices = []
+ dataset_info = DatasetInfo(model_cfg.data.test.dataset_info)
+
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get('left_hand_root', None))
+
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'left_thumb{id}', None))
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'left_forefinger{id}',
+ None))
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'left_middle_finger{id}',
+ None))
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'left_ring_finger{id}',
+ None))
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'left_pinky_finger{id}',
+ None))
+
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get('right_hand_root', None))
+
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'right_thumb{id}', None))
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'right_forefinger{id}',
+ None))
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'right_middle_finger{id}',
+ None))
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'right_ring_finger{id}',
+ None))
+ for id in range(1, 5):
+ hand_indices.append(
+ dataset_info.keypoint_name2id.get(f'right_pinky_finger{id}',
+ None))
+
+ except AttributeError:
+ hand_indices = None
+
+ if hand_indices is None:
+ # Fall back to hard coded keypoint id
+ dataset_name = model_cfg.data.test.type
+ if dataset_name in {'TopDownCocoWholeBodyDataset'}:
+ hand_indices = list(range(91, 133))
+ else:
+ raise ValueError('Can not determine the hand id of '
+ f'{dataset_name}')
+
+ return hand_indices
diff --git a/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/webcam_runner.py b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/webcam_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..7843b392cfd367d778109794a345f1c361395407
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/third-party/ViTPose/tools/webcam/webcam_apis/webcam_runner.py
@@ -0,0 +1,272 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import sys
+import time
+import warnings
+from contextlib import nullcontext
+from threading import Thread
+from typing import Dict, List, Optional, Tuple, Union
+
+import cv2
+
+from .nodes import NODES
+from .utils import (BufferManager, EventManager, FrameMessage, ImageCapture,
+ VideoEndingMessage, is_image_file, limit_max_fps)
+
+DEFAULT_FRAME_BUFFER_SIZE = 1
+DEFAULT_INPUT_BUFFER_SIZE = 1
+DEFAULT_DISPLAY_BUFFER_SIZE = 0
+DEFAULT_USER_BUFFER_SIZE = 1
+
+
+class WebcamRunner():
+ """An interface for building webcam application from config.
+
+ Parameters:
+ name (str): Runner name.
+ camera_id (int | str): The camera ID (usually the ID of the default
+ camera is 0). Alternatively a file path or a URL can be given
+ to load from a video or image file.
+ camera_frame_shape (tuple, optional): Set the frame shape of the
+ camera in (width, height). If not given, the default frame shape
+ will be used. This argument is only valid when using a camera
+ as the input source. Default: None
+ camera_fps (int): Video reading maximum FPS. Default: 30
+ buffer_sizes (dict, optional): A dict to specify buffer sizes. The
+ key is the buffer name and the value is the buffer size.
+ Default: None
+ synchronous (bool): If True, the runner will read a new frame only
+ after the previous output has been consumed by the display loop.
+ Default: False
+ nodes (list): Node configs.
+ """
+
+ def __init__(self,
+ name: str = 'Default Webcam Runner',
+ camera_id: Union[int, str] = 0,
+ camera_fps: int = 30,
+ camera_frame_shape: Optional[Tuple[int, int]] = None,
+ synchronous: bool = False,
+ buffer_sizes: Optional[Dict[str, int]] = None,
+ nodes: Optional[List[Dict]] = None):
+
+ # Basic parameters
+ self.name = name
+ self.camera_id = camera_id
+ self.camera_fps = camera_fps
+ self.camera_frame_shape = camera_frame_shape
+ self.synchronous = synchronous
+
+ # self.buffer_manager manages data flow between runner and nodes
+ self.buffer_manager = BufferManager()
+ # self.event_manager manages event-based asynchronous communication
+ self.event_manager = EventManager()
+ # self.node_list holds all node instances
+ self.node_list = []
+ # self.vcap is used to read camera frames. It will be built when the
+ # runner starts running
+ self.vcap = None
+
+ # Register runner events
+ self.event_manager.register_event('_exit_', is_keyboard=False)
+ if self.synchronous:
+ self.event_manager.register_event('_idle_', is_keyboard=False)
+
+ # Register nodes
+ if not nodes:
+ raise ValueError('No node is registered to the runner.')
+
+ # Register default buffers
+ if buffer_sizes is None:
+ buffer_sizes = {}
+ # _frame_ buffer
+ frame_buffer_size = buffer_sizes.get('_frame_',
+ DEFAULT_FRAME_BUFFER_SIZE)
+ self.buffer_manager.register_buffer('_frame_', frame_buffer_size)
+ # _input_ buffer
+ input_buffer_size = buffer_sizes.get('_input_',
+ DEFAULT_INPUT_BUFFER_SIZE)
+ self.buffer_manager.register_buffer('_input_', input_buffer_size)
+ # _display_ buffer
+ display_buffer_size = buffer_sizes.get('_display_',
+ DEFAULT_DISPLAY_BUFFER_SIZE)
+ self.buffer_manager.register_buffer('_display_', display_buffer_size)
+
+ # Build all nodes:
+ for node_cfg in nodes:
+ logging.info(f'Create node: {node_cfg.name}({node_cfg.type})')
+ node = NODES.build(node_cfg)
+
+ # Register node
+ self.node_list.append(node)
+
+ # Register buffers
+ for buffer_info in node.registered_buffers:
+ buffer_name = buffer_info.buffer_name
+ if buffer_name in self.buffer_manager:
+ continue
+ buffer_size = buffer_sizes.get(buffer_name,
+ DEFAULT_USER_BUFFER_SIZE)
+ self.buffer_manager.register_buffer(buffer_name, buffer_size)
+ logging.info(
+ f'Register user buffer: {buffer_name}({buffer_size})')
+
+ # Register events
+ for event_info in node.registered_events:
+ self.event_manager.register_event(
+ event_name=event_info.event_name,
+ is_keyboard=event_info.is_keyboard)
+ logging.info(f'Register event: {event_info.event_name}')
+
+ # Set runner for nodes
+ # This step is performed after node building, when the runner has
+ # created the full buffer/event managers and can be safely
+ # referenced by the nodes
+ for node in self.node_list:
+ logging.info(f'Set runner for node: {node.name}')
+ node.set_runner(self)
+
+ def _read_camera(self):
+ """Continually read video frames and put them into buffers."""
+
+ camera_id = self.camera_id
+ fps = self.camera_fps
+
+ # Build video capture
+ if is_image_file(camera_id):
+ self.vcap = ImageCapture(camera_id)
+ else:
+ self.vcap = cv2.VideoCapture(camera_id)
+ if self.camera_frame_shape is not None:
+ width, height = self.camera_frame_shape
+ self.vcap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
+ self.vcap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
+
+ if not self.vcap.isOpened():
+ warnings.warn(f'Cannot open camera (ID={camera_id})')
+ sys.exit()
+
+ # Read video frames in a loop
+ first_frame = True
+ while not self.event_manager.is_set('_exit_'):
+ if self.synchronous:
+ if first_frame:
+ cm = nullcontext()
+ else:
+ # Read a new frame only after the last frame has been processed
+ cm = self.event_manager.wait_and_handle('_idle_')
+ else:
+ # Read frames with a maximum FPS
+ cm = limit_max_fps(fps)
+
+ first_frame = False
+
+ with cm:
+ # Read a frame
+ ret_val, frame = self.vcap.read()
+ if ret_val:
+ # Put frame message (for display) into buffer `_frame_`
+ frame_msg = FrameMessage(frame)
+ self.buffer_manager.put('_frame_', frame_msg)
+
+ # Put input message (for model inference or other use)
+ # into buffer `_input_`
+ input_msg = FrameMessage(frame.copy())
+ input_msg.update_route_info(
+ node_name='Camera Info',
+ node_type='dummy',
+ info=self._get_camera_info())
+ self.buffer_manager.put_force('_input_', input_msg)
+
+ else:
+ # Put a video ending signal
+ self.buffer_manager.put('_frame_', VideoEndingMessage())
+
+ self.vcap.release()
+
+ def _display(self):
+ """Continually obtain and display output frames."""
+
+ output_msg = None
+
+ while not self.event_manager.is_set('_exit_'):
+ while self.buffer_manager.is_empty('_display_'):
+ time.sleep(0.001)
+
+ # Set _idle_ to allow reading next frame
+ if self.synchronous:
+ self.event_manager.set('_idle_')
+
+ # acquire output from buffer
+ output_msg = self.buffer_manager.get('_display_')
+
+ # A VideoEndingMessage indicates the input stream has ended
+ if isinstance(output_msg, VideoEndingMessage):
+ self.event_manager.set('_exit_')
+ break
+
+ img = output_msg.get_image()
+
+ # show in a window
+ cv2.imshow(self.name, img)
+
+ # handle keyboard input
+ key = cv2.waitKey(1)
+ if key != -1:
+ self._on_keyboard_input(key)
+
+ cv2.destroyAllWindows()
+
+ def _on_keyboard_input(self, key):
+ """Handle the keyboard input."""
+
+ if key in (27, ord('q'), ord('Q')):
+ logging.info(f'Exit event captured: {key}')
+ self.event_manager.set('_exit_')
+ else:
+ logging.info(f'Keyboard event captured: {key}')
+ self.event_manager.set(key, is_keyboard=True)
+
+ def _get_camera_info(self):
+ """Return the camera information in a dict."""
+
+ frame_width = self.vcap.get(cv2.CAP_PROP_FRAME_WIDTH)
+ frame_height = self.vcap.get(cv2.CAP_PROP_FRAME_HEIGHT)
+ frame_rate = self.vcap.get(cv2.CAP_PROP_FPS)
+
+ cam_info = {
+ 'Camera ID': self.camera_id,
+ 'Source resolution': f'{frame_width}x{frame_height}',
+ 'Source FPS': frame_rate,
+ }
+
+ return cam_info
+
+ def run(self):
+ """Program entry.
+
+ This method starts all nodes as well as video I/O in separate threads.
+ """
+
+ try:
+ # Start node threads
+ non_daemon_nodes = []
+ for node in self.node_list:
+ node.start()
+ if not node.daemon:
+ non_daemon_nodes.append(node)
+
+ # Create a thread to read video frames
+ t_read = Thread(target=self._read_camera, args=())
+ t_read.start()
+
+ # Run display in the main thread
+ self._display()
+ logging.info('Display shut down')
+
+ # join the camera-reading thread and non-daemon node threads
+ logging.info('Camera reading about to join')
+ t_read.join()
+
+ for node in non_daemon_nodes:
+ logging.info(f'Node {node.name} about to join')
+ node.join()
+
+ except KeyboardInterrupt:
+ pass
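+
+
+# Illustrative entry point (a sketch; the config path and the `runner`
+# field are hypothetical and depend on the webcam demo config in use):
+#
+#   from mmcv import Config
+#   cfg = Config.fromfile('configs/pose_estimation.py')
+#   runner = WebcamRunner(**cfg.runner)
+#   runner.run()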
diff --git a/phantom/submodules/phantom-hamer/train.py b/phantom/submodules/phantom-hamer/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..329e40bb19f3fed4ba42fd9fff1abaefa22ff287
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/train.py
@@ -0,0 +1,113 @@
+from typing import Optional, Tuple
+import pyrootutils
+
+root = pyrootutils.setup_root(
+ search_from=__file__,
+ indicator=[".git", "pyproject.toml"],
+ pythonpath=True,
+ dotenv=True,
+)
+
+import os
+from pathlib import Path
+
+import hydra
+import pytorch_lightning as pl
+from omegaconf import DictConfig, OmegaConf
+from pytorch_lightning import Trainer
+from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.plugins.environments import SLURMEnvironment
+#from pytorch_lightning.trainingtype import DDPPlugin
+
+from yacs.config import CfgNode
+from hamer.configs import dataset_config
+from hamer.datasets import HAMERDataModule
+from hamer.models.hamer import HAMER
+from hamer.utils.pylogger import get_pylogger
+from hamer.utils.misc import task_wrapper, log_hyperparameters
+
+# HACK reset the signal handling so the lightning is free to set it
+# Based on https://github.com/facebookincubator/submitit/issues/1709#issuecomment-1246758283
+import signal
+signal.signal(signal.SIGUSR1, signal.SIG_DFL)
+
+log = get_pylogger(__name__)
+
+
+@pl.utilities.rank_zero.rank_zero_only
+def save_configs(model_cfg: CfgNode, dataset_cfg: CfgNode, rootdir: str):
+ """Save config files to rootdir."""
+ Path(rootdir).mkdir(parents=True, exist_ok=True)
+ OmegaConf.save(config=model_cfg, f=os.path.join(rootdir, 'model_config.yaml'))
+ with open(os.path.join(rootdir, 'dataset_config.yaml'), 'w') as f:
+ f.write(dataset_cfg.dump())
+
+@task_wrapper
+def train(cfg: DictConfig) -> Tuple[dict, dict]:
+
+ # Load dataset config
+ dataset_cfg = dataset_config()
+
+ # Save configs
+ save_configs(cfg, dataset_cfg, cfg.paths.output_dir)
+
+ # Setup training and validation datasets
+ datamodule = HAMERDataModule(cfg, dataset_cfg)
+
+ # Setup model
+ model = HAMER(cfg)
+
+ # Setup Tensorboard logger
+ logger = TensorBoardLogger(os.path.join(cfg.paths.output_dir, 'tensorboard'), name='', version='', default_hp_metric=False)
+ loggers = [logger]
+
+ # Setup checkpoint saving
+ checkpoint_callback = pl.callbacks.ModelCheckpoint(
+ dirpath=os.path.join(cfg.paths.output_dir, 'checkpoints'),
+ every_n_train_steps=cfg.GENERAL.CHECKPOINT_STEPS,
+ save_last=True,
+ save_top_k=cfg.GENERAL.CHECKPOINT_SAVE_TOP_K,
+ )
+ rich_callback = pl.callbacks.RichProgressBar()
+ lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')
+ callbacks = [
+ checkpoint_callback,
+ lr_monitor,
+ # rich_callback
+ ]
+
+ log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
+ trainer: Trainer = hydra.utils.instantiate(
+ cfg.trainer,
+ callbacks=callbacks,
+ logger=loggers,
+ #plugins=(SLURMEnvironment(requeue_signal=signal.SIGUSR2) if (cfg.get('launcher',None) is not None) else DDPPlugin(find_unused_parameters=False)), # Submitit uses SIGUSR2
+ plugins=(SLURMEnvironment(requeue_signal=signal.SIGUSR2) if (cfg.get('launcher',None) is not None) else None), # Submitit uses SIGUSR2
+ )
+
+ object_dict = {
+ "cfg": cfg,
+ "datamodule": datamodule,
+ "model": model,
+ "callbacks": callbacks,
+ "logger": logger,
+ "trainer": trainer,
+ }
+
+ if logger:
+ log.info("Logging hyperparameters!")
+ log_hyperparameters(object_dict)
+
+ # Train the model
+ trainer.fit(model, datamodule=datamodule, ckpt_path='last')
+ log.info("Fitting done")
+
+
+@hydra.main(version_base="1.2", config_path=str(root/"hamer/configs_hydra"), config_name="train.yaml")
+def main(cfg: DictConfig) -> Optional[float]:
+ # train the model
+ train(cfg)
+
+
+if __name__ == "__main__":
+ main()
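+
+# The script is launched via Hydra, e.g. `python train.py`, optionally with
+# Hydra-style key=value overrides for the option groups defined under
+# hamer/configs_hydra (the exact override keys live there, not in this file).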
diff --git a/phantom/submodules/phantom-hamer/vitpose_model.py b/phantom/submodules/phantom-hamer/vitpose_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9206f38420220234baca89f2a3438b2ca9ce51d
--- /dev/null
+++ b/phantom/submodules/phantom-hamer/vitpose_model.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import os
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from mmpose.apis import inference_top_down_pose_model, init_pose_model, process_mmdet_results, vis_pose_result
+
+os.environ["PYOPENGL_PLATFORM"] = "egl"
+
+# project root directory
+ROOT_DIR = "./"
+VIT_DIR = os.path.join(ROOT_DIR, "third-party/ViTPose")
+
+class ViTPoseModel(object):
+ def __init__(self, device: str | torch.device, root_dir: str = ROOT_DIR, vit_dir: str = VIT_DIR):
+ self.MODEL_DICT = {
+ 'ViTPose+-G (multi-task train, COCO)': {
+ 'config': f'{vit_dir}/configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/ViTPose_huge_wholebody_256x192.py',
+ 'model': f'{root_dir}/_DATA/vitpose_ckpts/vitpose+_huge/wholebody.pth',
+ },
+ }
+ self.device = torch.device(device)
+ self.model_name = 'ViTPose+-G (multi-task train, COCO)'
+ self.model = self._load_model(self.model_name)
+
+ def _load_all_models_once(self) -> None:
+ for name in self.MODEL_DICT:
+ self._load_model(name)
+
+ def _load_model(self, name: str) -> nn.Module:
+ dic = self.MODEL_DICT[name]
+ ckpt_path = dic['model']
+ model = init_pose_model(dic['config'], ckpt_path, device=self.device)
+ return model
+
+ def set_model(self, name: str) -> None:
+ if name == self.model_name:
+ return
+ self.model_name = name
+ self.model = self._load_model(name)
+
+ def predict_pose_and_visualize(
+ self,
+ image: np.ndarray,
+ det_results: list[np.ndarray],
+ box_score_threshold: float,
+ kpt_score_threshold: float,
+ vis_dot_radius: int,
+ vis_line_thickness: int,
+ ) -> tuple[list[dict[str, np.ndarray]], np.ndarray]:
+ out = self.predict_pose(image, det_results, box_score_threshold)
+ vis = self.visualize_pose_results(image, out, kpt_score_threshold,
+ vis_dot_radius, vis_line_thickness)
+ return out, vis
+
+ def predict_pose(
+ self,
+ image: np.ndarray,
+ det_results: list[np.ndarray],
+ box_score_threshold: float = 0.5) -> list[dict[str, np.ndarray]]:
+ image = image[:, :, ::-1] # RGB -> BGR
+ person_results = process_mmdet_results(det_results, 1)
+ out, _ = inference_top_down_pose_model(self.model,
+ image,
+ person_results=person_results,
+ bbox_thr=box_score_threshold,
+ format='xyxy')
+ return out
+
+ def visualize_pose_results(self,
+ image: np.ndarray,
+ pose_results: list[np.ndarray],
+ kpt_score_threshold: float = 0.3,
+ vis_dot_radius: int = 4,
+ vis_line_thickness: int = 1) -> np.ndarray:
+ image = image[:, :, ::-1] # RGB -> BGR
+ vis = vis_pose_result(self.model,
+ image,
+ pose_results,
+ kpt_score_thr=kpt_score_threshold,
+ radius=vis_dot_radius,
+ thickness=vis_line_thickness)
+ return vis[:, :, ::-1] # BGR -> RGB
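+
+
+# Illustrative usage (a sketch; `image` is an RGB array, `det_results` are
+# mmdet-style detections containing the person class, and the checkpoint
+# paths in MODEL_DICT are assumed to exist locally):
+#
+#   cpm = ViTPoseModel(device='cuda')
+#   pose_out, vis = cpm.predict_pose_and_visualize(
+#       image, det_results, box_score_threshold=0.5,
+#       kpt_score_threshold=0.3, vis_dot_radius=4, vis_line_thickness=1)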
diff --git a/phantom/submodules/phantom-robomimic/.gitignore b/phantom/submodules/phantom-robomimic/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..19002b4a83137bcec04430bbb303436f4ea15b6a
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/.gitignore
@@ -0,0 +1,126 @@
+# pip distribution folder
+dist/
+
+# datasets folder at top-level (leading slash)
+/datasets
+/experiment_results
+
+# local test dataset that is lazily downloaded by example scripts
+tests/assets/test.hdf5
+tests/assets/test_v141.hdf5
+
+# pycharm configs
+.idea/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+
+.DS_Store
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+*.mp4
+*.pth
+
+# private macros
+macros_private.py
diff --git a/phantom/submodules/phantom-robomimic/LICENSE b/phantom/submodules/phantom-robomimic/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..934eaa87bb98d79ced50c8f27849625dc97b934d
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Stanford Vision and Learning Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/phantom/submodules/phantom-robomimic/MANIFEST.in b/phantom/submodules/phantom-robomimic/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..03d46cbc6f2c4d1ebfbba8c8049fa04342f9defd
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/MANIFEST.in
@@ -0,0 +1,9 @@
+include robomimic/exps/templates/*.json
+include robomimic/scripts/*.py
+include robomimic/scripts/*.sh
+include robomimic/scripts/conversion/*.py
+include robomimic/scripts/conversion/*.sh
+recursive-include examples/ *.py
+recursive-include tests/ *.py
+recursive-include tests/ *.sh
+recursive-include tests/assets/ *
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/README.md b/phantom/submodules/phantom-robomimic/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bdc3556ff9439556f124d83502669aba42d621b0
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/README.md
@@ -0,0 +1,90 @@
+# robomimic
+
+[**[Homepage]**](https://robomimic.github.io/) [**[Documentation]**](https://robomimic.github.io/docs/introduction/overview.html) [**[Study Paper]**](https://arxiv.org/abs/2108.03298) [**[Study Website]**](https://robomimic.github.io/study/) [**[ARISE Initiative]**](https://github.com/ARISE-Initiative)
+
+-------
+## Latest Updates
+- [10/11/2023] **v0.3.1**: support for extracting, training on, and visualizing depth observations for robosuite datasets
+- [07/03/2023] **v0.3.0**: BC-Transformer and IQL :brain:, support for DeepMind MuJoCo bindings :robot:, pre-trained image reps :eye:, wandb logging :chart_with_upwards_trend:, and more
+- [05/23/2022] **v0.2.1**: Updated website and documentation to feature more tutorials :notebook_with_decorative_cover:
+- [12/16/2021] **v0.2.0**: Modular observation modalities and encoders :wrench:, support for [MOMART](https://sites.google.com/view/il-for-mm/home) datasets :open_file_folder: [[release notes]](https://github.com/ARISE-Initiative/robomimic/releases/tag/v0.2.0) [[documentation]](https://robomimic.github.io/docs/v0.2/introduction/overview.html)
+- [08/09/2021] **v0.1.0**: Initial code and paper release
+
+-------
+
+## Colab quickstart
+Get started with a quick Colab notebook demo of robomimic without installing anything locally.
+
+[Open the robomimic quickstart notebook in Colab](https://colab.research.google.com/drive/1b62r_km9pP40fKF0cBdpdTO2P_2eIbC6?usp=sharing)
+
+
+-------
+
+**robomimic** is a framework for robot learning from demonstration.
+It offers a broad set of demonstration datasets collected across robot manipulation domains, along with offline learning algorithms for learning from these datasets.
+**robomimic** aims to make robot learning broadly *accessible* and *reproducible*, allowing researchers and practitioners to benchmark tasks and algorithms fairly and to develop the next generation of robot learning algorithms.
+
+## Core Features
+
+See the [documentation](https://robomimic.github.io/docs/introduction/overview.html) for an overview of core features.
+
+## Reproducing benchmarks
+
+The robomimic framework also makes reproducing the results from different benchmarks and datasets easy. See the [datasets page](https://robomimic.github.io/docs/datasets/overview.html) for more information on downloading datasets and reproducing experiments.
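+
+For example, the proficient-human `lift` dataset can typically be fetched with the bundled download script, e.g. `python robomimic/scripts/download_datasets.py --tasks lift --dataset_types ph --hdf5_types low_dim` (the exact flags may vary by version; see the datasets page).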
+
+## Troubleshooting
+
+Please see the [troubleshooting](https://robomimic.github.io/docs/miscellaneous/troubleshooting.html) section for common fixes, or [submit an issue](https://github.com/ARISE-Initiative/robomimic/issues) on our github page.
+
+## Contributing to robomimic
+This project is part of the broader [Advancing Robot Intelligence through Simulated Environments (ARISE) Initiative](https://github.com/ARISE-Initiative), with the aim of lowering the barriers to entry for cutting-edge research at the intersection of AI and robotics.
+The project originally began development in late 2018 by researchers in the [Stanford Vision and Learning Lab](http://svl.stanford.edu/) (SVL).
+Now it is actively maintained and used for robotics research projects across multiple labs.
+We welcome community contributions to this project.
+For details please check our [contributing guidelines](https://robomimic.github.io/docs/miscellaneous/contributing.html).
+
+## Citation
+
+Please cite [this paper](https://arxiv.org/abs/2108.03298) if you use this framework in your work:
+
+```bibtex
+@inproceedings{robomimic2021,
+ title={What Matters in Learning from Offline Human Demonstrations for Robot Manipulation},
+ author={Ajay Mandlekar and Danfei Xu and Josiah Wong and Soroush Nasiriany and Chen Wang and Rohun Kulkarni and Li Fei-Fei and Silvio Savarese and Yuke Zhu and Roberto Mart\'{i}n-Mart\'{i}n},
+ booktitle={Conference on Robot Learning (CoRL)},
+ year={2021}
+}
+```
diff --git a/phantom/submodules/phantom-robomimic/requirements-docs.txt b/phantom/submodules/phantom-robomimic/requirements-docs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4b0538e24a3a648ba885ddf9d42b39c26f48087a
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/requirements-docs.txt
@@ -0,0 +1,8 @@
+# requirements for building sphinx docs
+pygments==2.4.1
+sphinx
+sphinx_rtd_theme
+sphinx_markdown_tables
+sphinx_book_theme
+recommonmark
+nbsphinx
diff --git a/phantom/submodules/phantom-robomimic/requirements.txt b/phantom/submodules/phantom-robomimic/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ac8ea89aa900a793eff45c57b39f3137e9edb872
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/requirements.txt
@@ -0,0 +1,14 @@
+numpy>=1.13.3
+h5py
+psutil
+tqdm
+termcolor
+tensorboard
+tensorboardX
+imageio
+imageio-ffmpeg
+matplotlib
+egl_probe>=1.0.1
+torch
+torchvision
+diffusers==0.11.1
diff --git a/phantom/submodules/phantom-robomimic/robomimic/__init__.py b/phantom/submodules/phantom-robomimic/robomimic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1930630a3c7354ea8c9453aa3a4b280cebb2eceb
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/__init__.py
@@ -0,0 +1,159 @@
+__version__ = "0.3.1"
+
+
+# stores released dataset links and rollout horizons in a global dictionary.
+# Structure is given below for each type of dataset:
+
+# robosuite / real
+# {
+# task:
+# dataset_type:
+# hdf5_type:
+# url: link
+# horizon: value
+# ...
+# ...
+# ...
+# }
+DATASET_REGISTRY = {}
+
+# momart
+# {
+# task:
+# dataset_type:
+# url: link
+# size: value
+# ...
+# ...
+# }
+MOMART_DATASET_REGISTRY = {}
+
+
+def register_dataset_link(task, dataset_type, hdf5_type, link, horizon):
+ """
+ Helper function to register dataset link in global dictionary.
+ Also takes a @horizon parameter - this corresponds to the evaluation
+ rollout horizon that should be used during training.
+
+ Args:
+ task (str): name of task for this dataset
+ dataset_type (str): type of dataset (usually identifies the dataset source)
+ hdf5_type (str): type of hdf5 - usually one of "raw", "low_dim", or "image",
+ to identify the kind of observations in the dataset
+ link (str): download link for the dataset
+ horizon (int): evaluation rollout horizon that should be used with this dataset
+ """
+ if task not in DATASET_REGISTRY:
+ DATASET_REGISTRY[task] = {}
+ if dataset_type not in DATASET_REGISTRY[task]:
+ DATASET_REGISTRY[task][dataset_type] = {}
+ DATASET_REGISTRY[task][dataset_type][hdf5_type] = dict(url=link, horizon=horizon)
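+
+# Illustrative lookup: once links are registered, an entry can be read back as, e.g.,
+#   DATASET_REGISTRY["lift"]["ph"]["low_dim"]["url"]      # download link
+#   DATASET_REGISTRY["lift"]["ph"]["low_dim"]["horizon"]  # evaluation rollout horizon (400)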
+
+
+def register_all_links():
+ """
+ Record all dataset links in this function.
+ """
+
+ # all proficient human datasets
+ ph_tasks = ["lift", "can", "square", "transport", "tool_hang", "lift_real", "can_real", "tool_hang_real"]
+ ph_horizons = [400, 400, 400, 700, 700, 1000, 1000, 1000]
+ for task, horizon in zip(ph_tasks, ph_horizons):
+ register_dataset_link(task=task, dataset_type="ph", hdf5_type="raw", horizon=horizon,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/{}/ph/demo{}.hdf5".format(
+ task, "" if "real" in task else "_v141"
+ )
+ )
+ # real world datasets only have demo.hdf5 files which already contain all observation modalities
+ # while sim datasets store raw low-dim mujoco states in the demo.hdf5
+ if "real" not in task:
+ register_dataset_link(task=task, dataset_type="ph", hdf5_type="low_dim", horizon=horizon,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/{}/ph/low_dim_v141.hdf5".format(task))
+ register_dataset_link(task=task, dataset_type="ph", hdf5_type="image", horizon=horizon,
+ link=None)
+
+ # all multi human datasets
+ mh_tasks = ["lift", "can", "square", "transport"]
+ mh_horizons = [500, 500, 500, 1100]
+ for task, horizon in zip(mh_tasks, mh_horizons):
+ register_dataset_link(task=task, dataset_type="mh", hdf5_type="raw", horizon=horizon,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/{}/mh/demo_v141.hdf5".format(task))
+ register_dataset_link(task=task, dataset_type="mh", hdf5_type="low_dim", horizon=horizon,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/{}/mh/low_dim_v141.hdf5".format(task))
+ register_dataset_link(task=task, dataset_type="mh", hdf5_type="image", horizon=horizon,
+ link=None)
+
+ # all machine generated datasets
+ for task, horizon in zip(["lift", "can"], [400, 400]):
+ register_dataset_link(task=task, dataset_type="mg", hdf5_type="raw", horizon=horizon,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/{}/mg/demo_v141.hdf5".format(task))
+ register_dataset_link(task=task, dataset_type="mg", hdf5_type="low_dim_sparse", horizon=horizon,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/{}/mg/low_dim_sparse_v141.hdf5".format(task))
+ register_dataset_link(task=task, dataset_type="mg", hdf5_type="image_sparse", horizon=horizon,
+ link=None)
+ register_dataset_link(task=task, dataset_type="mg", hdf5_type="low_dim_dense", horizon=horizon,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/{}/mg/low_dim_dense_v141.hdf5".format(task))
+ register_dataset_link(task=task, dataset_type="mg", hdf5_type="image_dense", horizon=horizon,
+ link=None)
+
+ # can-paired dataset
+ register_dataset_link(task="can", dataset_type="paired", hdf5_type="raw", horizon=400,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/can/paired/demo_v141.hdf5")
+ register_dataset_link(task="can", dataset_type="paired", hdf5_type="low_dim", horizon=400,
+ link="http://downloads.cs.stanford.edu/downloads/rt_benchmark/can/paired/low_dim_v141.hdf5")
+ register_dataset_link(task="can", dataset_type="paired", hdf5_type="image", horizon=400,
+ link=None)
+
+
+def register_momart_dataset_link(task, dataset_type, link, dataset_size):
+ """
+ Helper function to register dataset link in global dictionary.
+    Also takes a @dataset_size parameter - this corresponds to the size of
+    the dataset, in GB.
+
+ Args:
+ task (str): name of task for this dataset
+ dataset_type (str): type of dataset (usually identifies the dataset source)
+ link (str): download link for the dataset
+ dataset_size (float): size of the dataset, in GB
+ """
+ if task not in MOMART_DATASET_REGISTRY:
+ MOMART_DATASET_REGISTRY[task] = {}
+ if dataset_type not in MOMART_DATASET_REGISTRY[task]:
+ MOMART_DATASET_REGISTRY[task][dataset_type] = {}
+ MOMART_DATASET_REGISTRY[task][dataset_type] = dict(url=link, size=dataset_size)
+
+
+def register_all_momart_links():
+ """
+ Record all dataset links in this function.
+ """
+ # all tasks, mapped to their [exp, sub, gen, sam] sizes
+ momart_tasks = {
+ "table_setup_from_dishwasher": [14, 14, 3.3, 0.6],
+ "table_setup_from_dresser": [16, 17, 3.1, 0.7],
+ "table_cleanup_to_dishwasher": [23, 36, 5.3, 1.1],
+ "table_cleanup_to_sink": [17, 28, 2.9, 0.8],
+ "unload_dishwasher": [21, 27, 5.4, 1.0],
+ }
+
+ momart_dataset_types = [
+ "expert",
+ "suboptimal",
+ "generalize",
+ "sample",
+ ]
+
+ # Iterate over all combos and register the link
+ for task, dataset_sizes in momart_tasks.items():
+ for dataset_type, dataset_size in zip(momart_dataset_types, dataset_sizes):
+ register_momart_dataset_link(
+ task=task,
+ dataset_type=dataset_type,
+ link=f"http://downloads.cs.stanford.edu/downloads/rt_mm/{dataset_type}/{task}_{dataset_type}.hdf5",
+ dataset_size=dataset_size,
+ )
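+
+# Illustrative lookup once links are registered, e.g.:
+#   MOMART_DATASET_REGISTRY["unload_dishwasher"]["expert"]["url"]   # download link
+#   MOMART_DATASET_REGISTRY["unload_dishwasher"]["expert"]["size"]  # size in GB (21)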
+
+
+register_all_links()
+register_all_momart_links()
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/__init__.py b/phantom/submodules/phantom-robomimic/robomimic/algo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dedba35c9b70e100cae7da46580720f35dc28ef1
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/__init__.py
@@ -0,0 +1,12 @@
+from robomimic.algo.algo import register_algo_factory_func, algo_name_to_factory_func, algo_factory, Algo, PolicyAlgo, ValueAlgo, PlannerAlgo, HierarchicalAlgo, RolloutPolicy
+
+# note: these imports are needed to register these classes in the global algo registry
+from robomimic.algo.bc import BC, BC_Gaussian, BC_GMM, BC_VAE, BC_RNN, BC_RNN_GMM
+from robomimic.algo.bcq import BCQ, BCQ_GMM, BCQ_Distributional
+from robomimic.algo.cql import CQL
+from robomimic.algo.iql import IQL
+from robomimic.algo.gl import GL, GL_VAE, ValuePlanner
+from robomimic.algo.hbc import HBC
+from robomimic.algo.iris import IRIS
+from robomimic.algo.td3_bc import TD3_BC
+from robomimic.algo.diffusion_policy import DiffusionPolicyUNet
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/algo.py b/phantom/submodules/phantom-robomimic/robomimic/algo/algo.py
new file mode 100644
index 0000000000000000000000000000000000000000..6289c214e15edbff66df6bfaceef25921ffb3b3b
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/algo.py
@@ -0,0 +1,574 @@
+"""
+This file contains base classes that other algorithm classes subclass.
+Each algorithm file also implements an algorithm factory function that
+takes in an algorithm config (`config.algo`) and returns the particular
+Algo subclass that should be instantiated, along with any extra kwargs.
+These factory functions are registered into a global dictionary with the
+@register_algo_factory_func function decorator. This makes it easy for
+@algo_factory to instantiate the correct `Algo` subclass.
+"""
+import textwrap
+from copy import deepcopy
+from collections import OrderedDict
+
+import torch.nn as nn
+import torch
+
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.action_utils as AcUtils
+
+
+# mapping from algo name to factory functions that map algo configs to algo class names
+REGISTERED_ALGO_FACTORY_FUNCS = OrderedDict()
+
+
+def register_algo_factory_func(algo_name):
+ """
+ Function decorator to register algo factory functions that map algo configs to algo class names.
+ Each algorithm implements such a function, and decorates it with this decorator.
+
+ Args:
+ algo_name (str): the algorithm name to register the algorithm under
+ """
+ def decorator(factory_func):
+ REGISTERED_ALGO_FACTORY_FUNCS[algo_name] = factory_func
+ return decorator
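+
+# Illustrative usage, mirroring how each algorithm module registers itself (see bc.py):
+#   @register_algo_factory_func("bc")
+#   def algo_config_to_class(algo_config):
+#       return BC, {}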
+
+
+def algo_name_to_factory_func(algo_name):
+ """
+ Uses registry to retrieve algo factory function from algo name.
+
+ Args:
+ algo_name (str): the algorithm name
+ """
+ return REGISTERED_ALGO_FACTORY_FUNCS[algo_name]
+
+
+def algo_factory(algo_name, config, obs_key_shapes, ac_dim, device):
+ """
+ Factory function for creating algorithms based on the algorithm name and config.
+
+ Args:
+ algo_name (str): the algorithm name
+
+ config (BaseConfig instance): config object
+
+ obs_key_shapes (OrderedDict): dictionary that maps observation keys to shapes
+
+ ac_dim (int): dimension of action space
+
+ device (torch.Device): where the algo should live (i.e. cpu, gpu)
+ """
+
+ # @algo_name is included as an arg to be explicit, but make sure it matches the config
+ assert algo_name == config.algo_name
+
+ # use algo factory func to get algo class and kwargs from algo config
+ factory_func = algo_name_to_factory_func(algo_name)
+ algo_cls, algo_kwargs = factory_func(config.algo)
+
+ # create algo instance
+ return algo_cls(
+ algo_config=config.algo,
+ obs_config=config.observation,
+ global_config=config,
+ obs_key_shapes=obs_key_shapes,
+ ac_dim=ac_dim,
+ device=device,
+ **algo_kwargs
+ )
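+
+# Illustrative sketch of invoking the factory; the obs_key_shapes / ac_dim values
+# below are placeholders rather than a real training setup:
+#   from robomimic.config import config_factory
+#   config = config_factory("bc")
+#   algo = algo_factory(
+#       algo_name=config.algo_name,
+#       config=config,
+#       obs_key_shapes=OrderedDict(robot0_eef_pos=[3]),
+#       ac_dim=7,
+#       device=torch.device("cpu"),
+#   )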
+
+
+class Algo(object):
+ """
+ Base algorithm class that all other algorithms subclass. Defines several
+    functions that should be overridden by subclasses, in order to provide
+ a standard API to be used by training functions such as @run_epoch in
+ utils/train_utils.py.
+ """
+ def __init__(
+ self,
+ algo_config,
+ obs_config,
+ global_config,
+ obs_key_shapes,
+ ac_dim,
+ device
+ ):
+ """
+ Args:
+ algo_config (Config object): instance of Config corresponding to the algo section
+ of the config
+
+ obs_config (Config object): instance of Config corresponding to the observation
+ section of the config
+
+ global_config (Config object): global training config
+
+ obs_key_shapes (OrderedDict): dictionary that maps observation keys to shapes
+
+ ac_dim (int): dimension of action space
+
+ device (torch.Device): where the algo should live (i.e. cpu, gpu)
+ """
+ self.optim_params = deepcopy(algo_config.optim_params)
+ self.algo_config = algo_config
+ self.obs_config = obs_config
+ self.global_config = global_config
+
+ self.ac_dim = ac_dim
+ self.device = device
+ self.obs_key_shapes = obs_key_shapes
+
+ self.nets = nn.ModuleDict()
+ self._create_shapes(obs_config.modalities, obs_key_shapes)
+ self._create_networks()
+ self._create_optimizers()
+ assert isinstance(self.nets, nn.ModuleDict)
+
+ def _create_shapes(self, obs_keys, obs_key_shapes):
+ """
+ Create obs_shapes, goal_shapes, and subgoal_shapes dictionaries, to make it
+ easy for this algorithm object to keep track of observation key shapes. Each dictionary
+ maps observation key to shape.
+
+ Args:
+ obs_keys (dict): dict of required observation keys for this training run (usually
+ specified by the obs config), e.g., {"obs": ["rgb", "proprio"], "goal": ["proprio"]}
+ obs_key_shapes (dict): dict of observation key shapes, e.g., {"rgb": [3, 224, 224]}
+ """
+ # determine shapes
+ self.obs_shapes = OrderedDict()
+ self.goal_shapes = OrderedDict()
+ self.subgoal_shapes = OrderedDict()
+
+        # We check across all modality groups (obs, goal, subgoal) to see if the input observation key exists
+        # in the modalities specified in the config. If so, we store its corresponding shape internally
+ for k in obs_key_shapes:
+ if "obs" in self.obs_config.modalities and k in [obs_key for modality in self.obs_config.modalities.obs.values() for obs_key in modality]:
+ self.obs_shapes[k] = obs_key_shapes[k]
+ if "goal" in self.obs_config.modalities and k in [obs_key for modality in self.obs_config.modalities.goal.values() for obs_key in modality]:
+ self.goal_shapes[k] = obs_key_shapes[k]
+ if "subgoal" in self.obs_config.modalities and k in [obs_key for modality in self.obs_config.modalities.subgoal.values() for obs_key in modality]:
+ self.subgoal_shapes[k] = obs_key_shapes[k]
+
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ @self.nets should be a ModuleDict.
+ """
+ raise NotImplementedError
+
+ def _create_optimizers(self):
+ """
+ Creates optimizers using @self.optim_params and places them into @self.optimizers.
+ """
+ self.optimizers = dict()
+ self.lr_schedulers = dict()
+
+ for k in self.optim_params:
+ # only make optimizers for networks that have been created - @optim_params may have more
+ # settings for unused networks
+ if k in self.nets:
+ if isinstance(self.nets[k], nn.ModuleList):
+ self.optimizers[k] = [
+ TorchUtils.optimizer_from_optim_params(net_optim_params=self.optim_params[k], net=self.nets[k][i])
+ for i in range(len(self.nets[k]))
+ ]
+ self.lr_schedulers[k] = [
+ TorchUtils.lr_scheduler_from_optim_params(net_optim_params=self.optim_params[k], net=self.nets[k][i], optimizer=self.optimizers[k][i])
+ for i in range(len(self.nets[k]))
+ ]
+ else:
+ self.optimizers[k] = TorchUtils.optimizer_from_optim_params(
+ net_optim_params=self.optim_params[k], net=self.nets[k])
+ self.lr_schedulers[k] = TorchUtils.lr_scheduler_from_optim_params(
+ net_optim_params=self.optim_params[k], net=self.nets[k], optimizer=self.optimizers[k])
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ return batch
+
+ def postprocess_batch_for_training(self, batch, obs_normalization_stats):
+ """
+ Does some operations (like channel swap, uint8 to float conversion, normalization)
+ after @process_batch_for_training is called, in order to ensure these operations
+ take place on GPU.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader. Assumed to be on the device where
+ training will occur (after @process_batch_for_training
+ is called)
+
+ obs_normalization_stats (dict or None): if provided, this should map observation
+ keys to dicts with a "mean" and "std" of shape (1, ...) where ... is the
+ default shape for the observation.
+
+ Returns:
+            batch (dict): postprocessed batch
+ """
+
+ # ensure obs_normalization_stats are torch Tensors on proper device
+ obs_normalization_stats = TensorUtils.to_float(TensorUtils.to_device(TensorUtils.to_tensor(obs_normalization_stats), self.device))
+
+ # we will search the nested batch dictionary for the following special batch dict keys
+ # and apply the processing function to their values (which correspond to observations)
+ obs_keys = ["obs", "next_obs", "goal_obs"]
+
+ def recurse_helper(d):
+ """
+ Apply process_obs_dict to values in nested dictionary d that match a key in obs_keys.
+ """
+ for k in d:
+ if k in obs_keys:
+ # found key - stop search and process observation
+ if d[k] is not None:
+ d[k] = ObsUtils.process_obs_dict(d[k])
+ if obs_normalization_stats is not None:
+ d[k] = ObsUtils.normalize_dict(d[k], obs_normalization_stats=obs_normalization_stats)
+ elif isinstance(d[k], dict):
+ # search down into dictionary
+ recurse_helper(d[k])
+
+ recurse_helper(batch)
+ return batch
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ assert validate or self.nets.training
+ return OrderedDict()
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss log (dict): name -> summary statistic
+ """
+ log = OrderedDict()
+
+ # record current optimizer learning rates
+ for k in self.optimizers:
+ for i, param_group in enumerate(self.optimizers[k].param_groups):
+ log["Optimizer/{}{}_lr".format(k, i)] = param_group["lr"]
+
+ return log
+
+ def on_epoch_end(self, epoch):
+ """
+ Called at the end of each epoch.
+ """
+
+ # LR scheduling updates
+ for k in self.lr_schedulers:
+ if self.lr_schedulers[k] is not None:
+ self.lr_schedulers[k].step()
+
+ def set_eval(self):
+ """
+ Prepare networks for evaluation.
+ """
+ self.nets.eval()
+
+ def set_train(self):
+ """
+ Prepare networks for training.
+ """
+ self.nets.train()
+
+ def serialize(self):
+ """
+ Get dictionary of current model parameters.
+ """
+ return self.nets.state_dict()
+
+ def deserialize(self, model_dict):
+ """
+ Load model from a checkpoint.
+
+ Args:
+ model_dict (dict): a dictionary saved by self.serialize() that contains
+ the same keys as @self.network_classes
+ """
+ self.nets.load_state_dict(model_dict)
+
+ def __repr__(self):
+ """
+ Pretty print algorithm and network description.
+ """
+ return "{} (\n".format(self.__class__.__name__) + \
+ textwrap.indent(self.nets.__repr__(), ' ') + "\n)"
+
+ def reset(self):
+ """
+ Reset algo state to prepare for environment rollouts.
+ """
+ pass
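+
+    # Illustrative per-batch flow, roughly as driven by @run_epoch in utils/train_utils.py:
+    #   batch = algo.process_batch_for_training(batch)
+    #   batch = algo.postprocess_batch_for_training(batch, obs_normalization_stats=None)
+    #   info = algo.train_on_batch(batch, epoch, validate=False)
+    #   step_log = algo.log_info(info)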
+
+
+class PolicyAlgo(Algo):
+ """
+ Base class for all algorithms that can be used as policies.
+ """
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ raise NotImplementedError
+
+
+class ValueAlgo(Algo):
+ """
+ Base class for all algorithms that can learn a value function.
+ """
+ def get_state_value(self, obs_dict, goal_dict=None):
+ """
+ Get state value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ raise NotImplementedError
+
+ def get_state_action_value(self, obs_dict, actions, goal_dict=None):
+ """
+ Get state-action value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ actions (torch.Tensor): action
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ raise NotImplementedError
+
+
+class PlannerAlgo(Algo):
+ """
+ Base class for all algorithms that can be used for planning subgoals
+ conditioned on current observations and potential goal observations.
+ """
+ def get_subgoal_predictions(self, obs_dict, goal_dict=None):
+ """
+ Get predicted subgoal outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoal prediction (dict): name -> Tensor [batch_size, ...]
+ """
+ raise NotImplementedError
+
+ def sample_subgoals(self, obs_dict, goal_dict, num_samples=1):
+ """
+ For planners that rely on sampling subgoals.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoals (dict): name -> Tensor [batch_size, num_samples, ...]
+ """
+ raise NotImplementedError
+
+
+class HierarchicalAlgo(Algo):
+ """
+ Base class for all hierarchical algorithms that consist of (1) subgoal planning
+ and (2) subgoal-conditioned policy learning.
+ """
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ raise NotImplementedError
+
+ def get_subgoal_predictions(self, obs_dict, goal_dict=None):
+ """
+ Get subgoal predictions from high-level subgoal planner.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoal (dict): predicted subgoal
+ """
+ raise NotImplementedError
+
+ @property
+ def current_subgoal(self):
+ """
+ Get the current subgoal for conditioning the low-level policy
+
+ Returns:
+ current subgoal (dict): predicted subgoal
+ """
+ raise NotImplementedError
+
+
+class RolloutPolicy(object):
+ """
+ Wraps @Algo object to make it easy to run policies in a rollout loop.
+ """
+ def __init__(self, policy, obs_normalization_stats=None, action_normalization_stats=None):
+ """
+ Args:
+ policy (Algo instance): @Algo object to wrap to prepare for rollouts
+
+ obs_normalization_stats (dict): optionally pass a dictionary for observation
+ normalization. This should map observation keys to dicts
+ with a "mean" and "std" of shape (1, ...) where ... is the default
+ shape for the observation.
+ """
+ self.policy = policy
+ self.obs_normalization_stats = obs_normalization_stats
+ self.action_normalization_stats = action_normalization_stats
+
+ def start_episode(self):
+ """
+ Prepare the policy to start a new rollout.
+ """
+ self.policy.set_eval()
+ self.policy.reset()
+
+ def _prepare_observation(self, ob):
+ """
+ Prepare raw observation dict from environment for policy.
+
+ Args:
+ ob (dict): single observation dictionary from environment (no batch dimension,
+ and np.array values for each key)
+ """
+ ob = TensorUtils.to_tensor(ob)
+ ob = TensorUtils.to_batch(ob)
+ ob = TensorUtils.to_device(ob, self.policy.device)
+ ob = TensorUtils.to_float(ob)
+ if self.obs_normalization_stats is not None:
+ # ensure obs_normalization_stats are torch Tensors on proper device
+ obs_normalization_stats = TensorUtils.to_float(TensorUtils.to_device(TensorUtils.to_tensor(self.obs_normalization_stats), self.policy.device))
+ # limit normalization to obs keys being used, in case environment includes extra keys
+ ob = { k : ob[k] for k in self.policy.global_config.all_obs_keys }
+ ob = ObsUtils.normalize_dict(ob, normalization_stats=obs_normalization_stats)
+ return ob
+
+ def __repr__(self):
+ """Pretty print network description"""
+ return self.policy.__repr__()
+
+ def __call__(self, ob, goal=None):
+ """
+ Produce action from raw observation dict (and maybe goal dict) from environment.
+
+ Args:
+ ob (dict): single observation dictionary from environment (no batch dimension,
+ and np.array values for each key)
+ goal (dict): goal observation
+ """
+ ob = self._prepare_observation(ob)
+ if goal is not None:
+ goal = self._prepare_observation(goal)
+ ac = self.policy.get_action(obs_dict=ob, goal_dict=goal)
+ ac = TensorUtils.to_numpy(ac[0])
+ if self.action_normalization_stats is not None:
+ action_keys = self.policy.global_config.train.action_keys
+ action_shapes = {k: self.action_normalization_stats[k]["offset"].shape[1:] for k in self.action_normalization_stats}
+ ac_dict = AcUtils.vector_to_action_dict(ac, action_shapes=action_shapes, action_keys=action_keys)
+ ac_dict = ObsUtils.unnormalize_dict(ac_dict, normalization_stats=self.action_normalization_stats)
+ action_config = self.policy.global_config.train.action_config
+ for key, value in ac_dict.items():
+ this_format = action_config[key].get('format', None)
+ if this_format == 'rot_6d':
+ rot_6d = torch.from_numpy(value).unsqueeze(0)
+ rot = TorchUtils.rot_6d_to_axis_angle(rot_6d=rot_6d).squeeze().numpy()
+ ac_dict[key] = rot
+ ac = AcUtils.action_dict_to_vector(ac_dict, action_keys=action_keys)
+ return ac
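+
+# Illustrative rollout sketch, assuming `env` follows robomimic's EnvBase interface
+# (step() returning obs, reward, done, info) and `horizon` is the task rollout horizon:
+#   policy.start_episode()
+#   ob = env.reset()
+#   for _ in range(horizon):
+#       ac = policy(ob)
+#       ob, r, done, _ = env.step(ac)
+#       if done:
+#           break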
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/bc.py b/phantom/submodules/phantom-robomimic/robomimic/algo/bc.py
new file mode 100644
index 0000000000000000000000000000000000000000..0797b7eae94b792833857ca6e04958084d4001dc
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/bc.py
@@ -0,0 +1,875 @@
+"""
+Implementation of Behavioral Cloning (BC).
+"""
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributions as D
+
+import robomimic.models.base_nets as BaseNets
+import robomimic.models.obs_nets as ObsNets
+import robomimic.models.policy_nets as PolicyNets
+import robomimic.models.vae_nets as VAENets
+import robomimic.utils.loss_utils as LossUtils
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.utils.obs_utils as ObsUtils
+
+from robomimic.algo import register_algo_factory_func, PolicyAlgo
+
+
+@register_algo_factory_func("bc")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the BC algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+
+ # note: we need the check below because some configs import BCConfig and exclude
+ # some of these options
+ gaussian_enabled = ("gaussian" in algo_config and algo_config.gaussian.enabled)
+ gmm_enabled = ("gmm" in algo_config and algo_config.gmm.enabled)
+ vae_enabled = ("vae" in algo_config and algo_config.vae.enabled)
+
+ rnn_enabled = algo_config.rnn.enabled
+ # support legacy configs that do not have "transformer" item
+ transformer_enabled = ("transformer" in algo_config) and algo_config.transformer.enabled
+
+ if gaussian_enabled:
+ if rnn_enabled:
+ raise NotImplementedError
+ elif transformer_enabled:
+ raise NotImplementedError
+ else:
+ algo_class, algo_kwargs = BC_Gaussian, {}
+ elif gmm_enabled:
+ if rnn_enabled:
+ algo_class, algo_kwargs = BC_RNN_GMM, {}
+ elif transformer_enabled:
+ algo_class, algo_kwargs = BC_Transformer_GMM, {}
+ else:
+ algo_class, algo_kwargs = BC_GMM, {}
+ elif vae_enabled:
+ if rnn_enabled:
+ raise NotImplementedError
+ elif transformer_enabled:
+ raise NotImplementedError
+ else:
+ algo_class, algo_kwargs = BC_VAE, {}
+ else:
+ if rnn_enabled:
+ algo_class, algo_kwargs = BC_RNN, {}
+ elif transformer_enabled:
+ algo_class, algo_kwargs = BC_Transformer, {}
+ else:
+ algo_class, algo_kwargs = BC, {}
+
+ return algo_class, algo_kwargs
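+
+# Illustrative mapping: a config with algo_config.gmm.enabled and algo_config.rnn.enabled
+# both True resolves to (BC_RNN_GMM, {}), while one with gaussian/gmm/vae/rnn/transformer
+# all disabled resolves to plain (BC, {}).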
+
+
+class BC(PolicyAlgo):
+ """
+ Normal BC training.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ self.nets = nn.ModuleDict()
+ self.nets["policy"] = PolicyNets.ActorNetwork(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor_layer_dims,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+ self.nets = self.nets.float().to(self.device)
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+ input_batch["obs"] = {k: batch["obs"][k][:, 0, :] for k in batch["obs"]}
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+ input_batch["actions"] = batch["actions"][:, 0, :]
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ with TorchUtils.maybe_no_grad(no_grad=validate):
+ info = super(BC, self).train_on_batch(batch, epoch, validate=validate)
+ predictions = self._forward_training(batch)
+ losses = self._compute_losses(predictions, batch)
+
+ info["predictions"] = TensorUtils.detach(predictions)
+ info["losses"] = TensorUtils.detach(losses)
+
+ if not validate:
+ step_info = self._train_step(losses)
+ info.update(step_info)
+
+ return info
+
+ def _forward_training(self, batch):
+ """
+ Internal helper function for BC algo class. Compute forward pass
+ and return network outputs in @predictions dict.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ predictions (dict): dictionary containing network outputs
+ """
+ predictions = OrderedDict()
+ actions = self.nets["policy"](obs_dict=batch["obs"], goal_dict=batch["goal_obs"])
+ predictions["actions"] = actions
+ return predictions
+
+ def _compute_losses(self, predictions, batch):
+ """
+ Internal helper function for BC algo class. Compute losses based on
+ network outputs in @predictions dict, using reference labels in @batch.
+
+ Args:
+ predictions (dict): dictionary containing network outputs, from @_forward_training
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ losses (dict): dictionary of losses computed over the batch
+ """
+ losses = OrderedDict()
+ a_target = batch["actions"]
+ actions = predictions["actions"]
+ losses["l2_loss"] = nn.MSELoss()(actions, a_target)
+ losses["l1_loss"] = nn.SmoothL1Loss()(actions, a_target)
+ # cosine direction loss on eef delta position
+ losses["cos_loss"] = LossUtils.cosine_loss(actions[..., :3], a_target[..., :3])
+
+ action_losses = [
+ self.algo_config.loss.l2_weight * losses["l2_loss"],
+ self.algo_config.loss.l1_weight * losses["l1_loss"],
+ self.algo_config.loss.cos_weight * losses["cos_loss"],
+ ]
+ action_loss = sum(action_losses)
+ losses["action_loss"] = action_loss
+ return losses
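+
+    # Illustrative: the total action loss is the weighted sum
+    #   l2_weight * MSE + l1_weight * SmoothL1 + cos_weight * cosine(actions[..., :3], targets[..., :3]),
+    # where the cosine term penalizes only the direction of the end-effector position delta.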
+
+ def _train_step(self, losses):
+ """
+ Internal helper function for BC algo class. Perform backpropagation on the
+ loss tensors in @losses to update networks.
+
+ Args:
+ losses (dict): dictionary of losses computed over the batch, from @_compute_losses
+ """
+
+ # gradient step
+ info = OrderedDict()
+ policy_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["policy"],
+ optim=self.optimizers["policy"],
+ loss=losses["action_loss"],
+ )
+ info["policy_grad_norms"] = policy_grad_norms
+ return info
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ log = super(BC, self).log_info(info)
+ log["Loss"] = info["losses"]["action_loss"].item()
+ if "l2_loss" in info["losses"]:
+ log["L2_Loss"] = info["losses"]["l2_loss"].item()
+ if "l1_loss" in info["losses"]:
+ log["L1_Loss"] = info["losses"]["l1_loss"].item()
+ if "cos_loss" in info["losses"]:
+ log["Cosine_Loss"] = info["losses"]["cos_loss"].item()
+ if "policy_grad_norms" in info:
+ log["Policy_Grad_Norms"] = info["policy_grad_norms"]
+ return log
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ assert not self.nets.training
+ return self.nets["policy"](obs_dict, goal_dict=goal_dict)
+
+
+class BC_Gaussian(BC):
+ """
+ BC training with a Gaussian policy.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ assert self.algo_config.gaussian.enabled
+
+ self.nets = nn.ModuleDict()
+ self.nets["policy"] = PolicyNets.GaussianActorNetwork(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor_layer_dims,
+ fixed_std=self.algo_config.gaussian.fixed_std,
+ init_std=self.algo_config.gaussian.init_std,
+ std_limits=(self.algo_config.gaussian.min_std, 7.5),
+ std_activation=self.algo_config.gaussian.std_activation,
+ low_noise_eval=self.algo_config.gaussian.low_noise_eval,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ self.nets = self.nets.float().to(self.device)
+
+ def _forward_training(self, batch):
+ """
+ Internal helper function for BC algo class. Compute forward pass
+ and return network outputs in @predictions dict.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ predictions (dict): dictionary containing network outputs
+ """
+ dists = self.nets["policy"].forward_train(
+ obs_dict=batch["obs"],
+ goal_dict=batch["goal_obs"],
+ )
+
+ # make sure that this is a batch of multivariate action distributions, so that
+ # the log probability computation will be correct
+ assert len(dists.batch_shape) == 1
+ log_probs = dists.log_prob(batch["actions"])
+
+ predictions = OrderedDict(
+ log_probs=log_probs,
+ )
+ return predictions
+
+ def _compute_losses(self, predictions, batch):
+ """
+ Internal helper function for BC algo class. Compute losses based on
+ network outputs in @predictions dict, using reference labels in @batch.
+
+ Args:
+ predictions (dict): dictionary containing network outputs, from @_forward_training
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ losses (dict): dictionary of losses computed over the batch
+ """
+
+ # loss is just negative log-likelihood of action targets
+ action_loss = -predictions["log_probs"].mean()
+ return OrderedDict(
+ log_probs=-action_loss,
+ action_loss=action_loss,
+ )
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ log = PolicyAlgo.log_info(self, info)
+ log["Loss"] = info["losses"]["action_loss"].item()
+ log["Log_Likelihood"] = info["losses"]["log_probs"].item()
+ if "policy_grad_norms" in info:
+ log["Policy_Grad_Norms"] = info["policy_grad_norms"]
+ return log
+
+
+class BC_GMM(BC_Gaussian):
+ """
+ BC training with a Gaussian Mixture Model policy.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ assert self.algo_config.gmm.enabled
+
+ self.nets = nn.ModuleDict()
+ self.nets["policy"] = PolicyNets.GMMActorNetwork(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor_layer_dims,
+ num_modes=self.algo_config.gmm.num_modes,
+ min_std=self.algo_config.gmm.min_std,
+ std_activation=self.algo_config.gmm.std_activation,
+ low_noise_eval=self.algo_config.gmm.low_noise_eval,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ self.nets = self.nets.float().to(self.device)
+
+
+class BC_VAE(BC):
+ """
+ BC training with a VAE policy.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ self.nets = nn.ModuleDict()
+ self.nets["policy"] = PolicyNets.VAEActor(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ device=self.device,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **VAENets.vae_args_from_config(self.algo_config.vae),
+ )
+
+ self.nets = self.nets.float().to(self.device)
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Update from superclass to set categorical temperature, for categorical VAEs.
+ """
+ if self.algo_config.vae.prior.use_categorical:
+ temperature = self.algo_config.vae.prior.categorical_init_temp - epoch * self.algo_config.vae.prior.categorical_temp_anneal_step
+ temperature = max(temperature, self.algo_config.vae.prior.categorical_min_temp)
+ self.nets["policy"].set_gumbel_temperature(temperature)
+ return super(BC_VAE, self).train_on_batch(batch, epoch, validate=validate)
+
+ def _forward_training(self, batch):
+ """
+ Internal helper function for BC algo class. Compute forward pass
+ and return network outputs in @predictions dict.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ predictions (dict): dictionary containing network outputs
+ """
+ vae_inputs = dict(
+ actions=batch["actions"],
+ obs_dict=batch["obs"],
+ goal_dict=batch["goal_obs"],
+ freeze_encoder=batch.get("freeze_encoder", False),
+ )
+
+ vae_outputs = self.nets["policy"].forward_train(**vae_inputs)
+ predictions = OrderedDict(
+ actions=vae_outputs["decoder_outputs"],
+ kl_loss=vae_outputs["kl_loss"],
+ reconstruction_loss=vae_outputs["reconstruction_loss"],
+ encoder_z=vae_outputs["encoder_z"],
+ )
+ if not self.algo_config.vae.prior.use_categorical:
+ with torch.no_grad():
+ encoder_variance = torch.exp(vae_outputs["encoder_params"]["logvar"])
+ predictions["encoder_variance"] = encoder_variance
+ return predictions
+
+ def _compute_losses(self, predictions, batch):
+ """
+ Internal helper function for BC algo class. Compute losses based on
+ network outputs in @predictions dict, using reference labels in @batch.
+
+ Args:
+ predictions (dict): dictionary containing network outputs, from @_forward_training
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ losses (dict): dictionary of losses computed over the batch
+ """
+
+ # total loss is sum of reconstruction and KL, weighted by beta
+ kl_loss = predictions["kl_loss"]
+ recons_loss = predictions["reconstruction_loss"]
+ action_loss = recons_loss + self.algo_config.vae.kl_weight * kl_loss
+ return OrderedDict(
+ recons_loss=recons_loss,
+ kl_loss=kl_loss,
+ action_loss=action_loss,
+ )
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ log = PolicyAlgo.log_info(self, info)
+ log["Loss"] = info["losses"]["action_loss"].item()
+ log["KL_Loss"] = info["losses"]["kl_loss"].item()
+ log["Reconstruction_Loss"] = info["losses"]["recons_loss"].item()
+ if self.algo_config.vae.prior.use_categorical:
+ log["Gumbel_Temperature"] = self.nets["policy"].get_gumbel_temperature()
+ else:
+ log["Encoder_Variance"] = info["predictions"]["encoder_variance"].mean().item()
+ if "policy_grad_norms" in info:
+ log["Policy_Grad_Norms"] = info["policy_grad_norms"]
+ return log
+
+
+class BC_RNN(BC):
+ """
+ BC training with an RNN policy.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ self.nets = nn.ModuleDict()
+ self.nets["policy"] = PolicyNets.RNNActorNetwork(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor_layer_dims,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **BaseNets.rnn_args_from_config(self.algo_config.rnn),
+ )
+
+ self._rnn_hidden_state = None
+ self._rnn_horizon = self.algo_config.rnn.horizon
+ self._rnn_counter = 0
+ self._rnn_is_open_loop = self.algo_config.rnn.get("open_loop", False)
+
+ self.nets = self.nets.float().to(self.device)
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+ input_batch["obs"] = batch["obs"]
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+ input_batch["actions"] = batch["actions"]
+
+ if self._rnn_is_open_loop:
+ # replace the observation sequence with one that only consists of the first observation.
+ # This way, all actions are predicted "open-loop" after the first observation, based
+ # on the rnn hidden state.
+ n_steps = batch["actions"].shape[1]
+ obs_seq_start = TensorUtils.index_at_time(batch["obs"], ind=0)
+ input_batch["obs"] = TensorUtils.unsqueeze_expand_at(obs_seq_start, size=n_steps, dim=1)
+
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ assert not self.nets.training
+
+ if self._rnn_hidden_state is None or self._rnn_counter % self._rnn_horizon == 0:
+ batch_size = list(obs_dict.values())[0].shape[0]
+ self._rnn_hidden_state = self.nets["policy"].get_rnn_init_state(batch_size=batch_size, device=self.device)
+
+ if self._rnn_is_open_loop:
+ # remember the initial observation, and use it instead of the current observation
+ # for open-loop action sequence prediction
+ self._open_loop_obs = TensorUtils.clone(TensorUtils.detach(obs_dict))
+
+ obs_to_use = obs_dict
+ if self._rnn_is_open_loop:
+ # replace current obs with last recorded obs
+ obs_to_use = self._open_loop_obs
+
+ self._rnn_counter += 1
+ action, self._rnn_hidden_state = self.nets["policy"].forward_step(
+ obs_to_use, goal_dict=goal_dict, rnn_state=self._rnn_hidden_state)
+ return action
+
+ def reset(self):
+ """
+ Reset algo state to prepare for environment rollouts.
+ """
+ self._rnn_hidden_state = None
+ self._rnn_counter = 0
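+
+    # Note: reset() should be called at the start of every rollout (RolloutPolicy.start_episode
+    # does this), so that a fresh hidden state is created on the next get_action call.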
+
+
+class BC_RNN_GMM(BC_RNN):
+ """
+ BC training with an RNN GMM policy.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ assert self.algo_config.gmm.enabled
+ assert self.algo_config.rnn.enabled
+
+ self.nets = nn.ModuleDict()
+ self.nets["policy"] = PolicyNets.RNNGMMActorNetwork(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor_layer_dims,
+ num_modes=self.algo_config.gmm.num_modes,
+ min_std=self.algo_config.gmm.min_std,
+ std_activation=self.algo_config.gmm.std_activation,
+ low_noise_eval=self.algo_config.gmm.low_noise_eval,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **BaseNets.rnn_args_from_config(self.algo_config.rnn),
+ )
+
+ self._rnn_hidden_state = None
+ self._rnn_horizon = self.algo_config.rnn.horizon
+ self._rnn_counter = 0
+ self._rnn_is_open_loop = self.algo_config.rnn.get("open_loop", False)
+
+ self.nets = self.nets.float().to(self.device)
+
+ def _forward_training(self, batch):
+ """
+ Internal helper function for BC algo class. Compute forward pass
+ and return network outputs in @predictions dict.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ predictions (dict): dictionary containing network outputs
+ """
+ dists = self.nets["policy"].forward_train(
+ obs_dict=batch["obs"],
+ goal_dict=batch["goal_obs"],
+ )
+
+ # make sure that this is a batch of multivariate action distributions, so that
+ # the log probability computation will be correct
+ assert len(dists.batch_shape) == 2 # [B, T]
+ log_probs = dists.log_prob(batch["actions"])
+
+ predictions = OrderedDict(
+ log_probs=log_probs,
+ )
+ return predictions
+
+ def _compute_losses(self, predictions, batch):
+ """
+ Internal helper function for BC algo class. Compute losses based on
+ network outputs in @predictions dict, using reference labels in @batch.
+
+ Args:
+ predictions (dict): dictionary containing network outputs, from @_forward_training
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ losses (dict): dictionary of losses computed over the batch
+ """
+
+ # loss is just negative log-likelihood of action targets
+ action_loss = -predictions["log_probs"].mean()
+ return OrderedDict(
+ log_probs=-action_loss,
+ action_loss=action_loss,
+ )
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ log = PolicyAlgo.log_info(self, info)
+ log["Loss"] = info["losses"]["action_loss"].item()
+ log["Log_Likelihood"] = info["losses"]["log_probs"].item()
+ if "policy_grad_norms" in info:
+ log["Policy_Grad_Norms"] = info["policy_grad_norms"]
+ return log
+
+
+class BC_Transformer(BC):
+ """
+ BC training with a Transformer policy.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ assert self.algo_config.transformer.enabled
+
+ self.nets = nn.ModuleDict()
+ self.nets["policy"] = PolicyNets.TransformerActorNetwork(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **BaseNets.transformer_args_from_config(self.algo_config.transformer),
+ )
+ self._set_params_from_config()
+ self.nets = self.nets.float().to(self.device)
+
+ def _set_params_from_config(self):
+ """
+ Read specific config variables we need for training / eval.
+ Called by @_create_networks method
+ """
+ self.context_length = self.algo_config.transformer.context_length
+ self.supervise_all_steps = self.algo_config.transformer.supervise_all_steps
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+ h = self.context_length
+ input_batch["obs"] = {k: batch["obs"][k][:, :h, :] for k in batch["obs"]}
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+
+ if self.supervise_all_steps:
+ # supervision on entire sequence (instead of just current timestep)
+ input_batch["actions"] = batch["actions"][:, :h, :]
+ else:
+ # just use current timestep
+ input_batch["actions"] = batch["actions"][:, h-1, :]
+
+ input_batch = TensorUtils.to_device(TensorUtils.to_float(input_batch), self.device)
+ return input_batch
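+
+    # Illustrative shapes: with context length h, input_batch["actions"] is [B, h, ac_dim]
+    # when supervise_all_steps is True, and [B, ac_dim] (final timestep only) otherwise.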
+
+ def _forward_training(self, batch, epoch=None):
+ """
+ Internal helper function for BC_Transformer algo class. Compute forward pass
+ and return network outputs in @predictions dict.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ predictions (dict): dictionary containing network outputs
+ """
+ # ensure that transformer context length is consistent with temporal dimension of observations
+ TensorUtils.assert_size_at_dim(
+ batch["obs"],
+ size=(self.context_length),
+ dim=1,
+ msg="Error: expect temporal dimension of obs batch to match transformer context length {}".format(self.context_length),
+ )
+
+ predictions = OrderedDict()
+ predictions["actions"] = self.nets["policy"](obs_dict=batch["obs"], actions=None, goal_dict=batch["goal_obs"])
+ if not self.supervise_all_steps:
+ # only supervise final timestep
+ predictions["actions"] = predictions["actions"][:, -1, :]
+ return predictions
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ assert not self.nets.training
+
+ return self.nets["policy"](obs_dict, actions=None, goal_dict=goal_dict)[:, -1, :]
+
+
+class BC_Transformer_GMM(BC_Transformer):
+ """
+ BC training with a Transformer GMM policy.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ assert self.algo_config.gmm.enabled
+ assert self.algo_config.transformer.enabled
+
+ self.nets = nn.ModuleDict()
+ self.nets["policy"] = PolicyNets.TransformerGMMActorNetwork(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ num_modes=self.algo_config.gmm.num_modes,
+ min_std=self.algo_config.gmm.min_std,
+ std_activation=self.algo_config.gmm.std_activation,
+ low_noise_eval=self.algo_config.gmm.low_noise_eval,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **BaseNets.transformer_args_from_config(self.algo_config.transformer),
+ )
+ self._set_params_from_config()
+ self.nets = self.nets.float().to(self.device)
+
+ def _forward_training(self, batch, epoch=None):
+ """
+ Modify from super class to support GMM training.
+ """
+ # ensure that transformer context length is consistent with temporal dimension of observations
+ TensorUtils.assert_size_at_dim(
+ batch["obs"],
+ size=(self.context_length),
+ dim=1,
+ msg="Error: expect temporal dimension of obs batch to match transformer context length {}".format(self.context_length),
+ )
+
+ dists = self.nets["policy"].forward_train(
+ obs_dict=batch["obs"],
+ actions=None,
+ goal_dict=batch["goal_obs"],
+ low_noise_eval=False,
+ )
+
+ # make sure that this is a batch of multivariate action distributions, so that
+ # the log probability computation will be correct
+ assert len(dists.batch_shape) == 2 # [B, T]
+
+ if not self.supervise_all_steps:
+ # only use final timestep prediction by making a new distribution with only final timestep.
+ # This essentially does `dists = dists[:, -1]`
+ component_distribution = D.Normal(
+ loc=dists.component_distribution.base_dist.loc[:, -1],
+ scale=dists.component_distribution.base_dist.scale[:, -1],
+ )
+ component_distribution = D.Independent(component_distribution, 1)
+ mixture_distribution = D.Categorical(logits=dists.mixture_distribution.logits[:, -1])
+ dists = D.MixtureSameFamily(
+ mixture_distribution=mixture_distribution,
+ component_distribution=component_distribution,
+ )
+
+ log_probs = dists.log_prob(batch["actions"])
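+        # log_probs has shape [B, T] when supervising all steps, and shape [B] when only
+        # the final-timestep distribution is kept above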
+
+ predictions = OrderedDict(
+ log_probs=log_probs,
+ )
+ return predictions
+
+ def _compute_losses(self, predictions, batch):
+ """
+ Internal helper function for BC_Transformer_GMM algo class. Compute losses based on
+ network outputs in @predictions dict, using reference labels in @batch.
+ Args:
+ predictions (dict): dictionary containing network outputs, from @_forward_training
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+ Returns:
+ losses (dict): dictionary of losses computed over the batch
+ """
+
+ # loss is just negative log-likelihood of action targets
+ action_loss = -predictions["log_probs"].mean()
+ return OrderedDict(
+ log_probs=-action_loss,
+ action_loss=action_loss,
+ )
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+ Args:
+ info (dict): dictionary of info
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ log = PolicyAlgo.log_info(self, info)
+ log["Loss"] = info["losses"]["action_loss"].item()
+ log["Log_Likelihood"] = info["losses"]["log_probs"].item()
+ if "policy_grad_norms" in info:
+ log["Policy_Grad_Norms"] = info["policy_grad_norms"]
+ return log
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/bcq.py b/phantom/submodules/phantom-robomimic/robomimic/algo/bcq.py
new file mode 100644
index 0000000000000000000000000000000000000000..5843ccb5bd594c596a8dc138eab863bb3f5e3550
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/bcq.py
@@ -0,0 +1,1022 @@
+"""
+Batch-Constrained Q-Learning (BCQ), with support for more general
+generative action models (the original paper uses a cVAE).
+(Paper - https://arxiv.org/abs/1812.02900).
+"""
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import robomimic.models.obs_nets as ObsNets
+import robomimic.models.policy_nets as PolicyNets
+import robomimic.models.value_nets as ValueNets
+import robomimic.models.vae_nets as VAENets
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.loss_utils as LossUtils
+
+from robomimic.algo import register_algo_factory_func, PolicyAlgo, ValueAlgo
+
+
+@register_algo_factory_func("bcq")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the BCQ algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+ if algo_config.critic.distributional.enabled:
+ return BCQ_Distributional, {}
+ if algo_config.action_sampler.gmm.enabled:
+ return BCQ_GMM, {}
+ assert algo_config.action_sampler.vae.enabled
+ return BCQ, {}
+
+
+class BCQ(PolicyAlgo, ValueAlgo):
+ """
+ Default BCQ training, based on https://arxiv.org/abs/1812.02900 and
+ https://github.com/sfujim/BCQ
+ """
+ def __init__(self, **kwargs):
+ PolicyAlgo.__init__(self, **kwargs)
+
+        # save the discount factor - it may be overridden later
+ self.set_discount(self.algo_config.discount)
+
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ self.nets = nn.ModuleDict()
+
+ self._create_critics()
+ self._create_action_sampler()
+ if self.algo_config.actor.enabled:
+ self._create_actor()
+
+ # sync target networks at beginning of training
+ with torch.no_grad():
+ for critic_ind in range(len(self.nets["critic"])):
+ TorchUtils.hard_update(
+ source=self.nets["critic"][critic_ind],
+ target=self.nets["critic_target"][critic_ind],
+ )
+
+ if self.algo_config.actor.enabled:
+ TorchUtils.hard_update(
+ source=self.nets["actor"],
+ target=self.nets["actor_target"],
+ )
+
+ self.nets = self.nets.float().to(self.device)
+
+ def _create_critics(self):
+ """
+ Called in @_create_networks to make critic networks.
+ """
+ critic_class = ValueNets.ActionValueNetwork
+ critic_args = dict(
+ obs_shapes=self.obs_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.critic.layer_dims,
+ value_bounds=self.algo_config.critic.value_bounds,
+ goal_shapes=self.goal_shapes,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ # Q network ensemble and target ensemble
+ self.nets["critic"] = nn.ModuleList()
+ self.nets["critic_target"] = nn.ModuleList()
+ for _ in range(self.algo_config.critic.ensemble.n):
+ critic = critic_class(**critic_args)
+ self.nets["critic"].append(critic)
+
+ critic_target = critic_class(**critic_args)
+ self.nets["critic_target"].append(critic_target)
+
+ def _create_action_sampler(self):
+ """
+ Called in @_create_networks to make action sampler network.
+ """
+
+ # VAE network for approximate sampling from batch dataset
+ assert self.algo_config.action_sampler.vae.enabled
+ self.nets["action_sampler"] = PolicyNets.VAEActor(
+ obs_shapes=self.obs_shapes,
+ ac_dim=self.ac_dim,
+ device=self.device,
+ goal_shapes=self.goal_shapes,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **VAENets.vae_args_from_config(self.algo_config.action_sampler.vae),
+ )
+
+ def _create_actor(self):
+ """
+ Called in @_create_networks to make actor network.
+ """
+ assert self.algo_config.actor.enabled
+ actor_class = PolicyNets.PerturbationActorNetwork
+ actor_args = dict(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor.layer_dims,
+ perturbation_scale=self.algo_config.actor.perturbation_scale,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ self.nets["actor"] = actor_class(**actor_args)
+ self.nets["actor_target"] = actor_class(**actor_args)
+
+ def _check_epoch(self, net_name, epoch):
+ """
+ Helper function to check whether backprop should happen this epoch.
+
+ Args:
+ net_name (str): name of network in @self.nets and @self.optim_params
+ epoch (int): epoch number
+ """
+ epoch_start_check = (self.optim_params[net_name]["start_epoch"] == -1) or (epoch >= self.optim_params[net_name]["start_epoch"])
+ epoch_end_check = (self.optim_params[net_name]["end_epoch"] == -1) or (epoch < self.optim_params[net_name]["end_epoch"])
+ return (epoch_start_check and epoch_end_check)
+
+ def set_discount(self, discount):
+ """
+ Useful function to modify discount factor if necessary (e.g. for n-step returns).
+ """
+ self.discount = discount
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+
+ # n-step returns (default is 1)
+ n_step = self.algo_config.n_step
+ assert batch["actions"].shape[1] >= n_step
+
+ # remove temporal batches for all
+ input_batch["obs"] = {k: batch["obs"][k][:, 0, :] for k in batch["obs"]}
+ input_batch["next_obs"] = {k: batch["next_obs"][k][:, n_step - 1, :] for k in batch["next_obs"]}
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+ input_batch["actions"] = batch["actions"][:, 0, :]
+
+ # note: ensure scalar signals (rewards, done) retain last dimension of 1 to be compatible with model outputs
+
+ # single timestep reward is discounted sum of intermediate rewards in sequence
+ reward_seq = batch["rewards"][:, :n_step]
+ discounts = torch.pow(self.algo_config.discount, torch.arange(n_step).float()).unsqueeze(0)
+ input_batch["rewards"] = (reward_seq * discounts).sum(dim=1).unsqueeze(1)
+
+ # discount rate will be gamma^N for computing n-step returns
+ new_discount = (self.algo_config.discount ** n_step)
+ self.set_discount(new_discount)
+
+        # consider this n-step sequence done if any intermediate dones are present
+ done_seq = batch["dones"][:, :n_step]
+ input_batch["dones"] = (done_seq.sum(dim=1) > 0).float().unsqueeze(1)
+
+ if self.algo_config.infinite_horizon:
+ # scale terminal rewards by 1 / (1 - gamma) for infinite horizon MDPs
+ done_inds = input_batch["dones"].round().long().nonzero(as_tuple=False)[:, 0]
+ if done_inds.shape[0] > 0:
+ input_batch["rewards"][done_inds] = input_batch["rewards"][done_inds] * (1. / (1. - self.discount))
+
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+ def _train_action_sampler_on_batch(self, batch, epoch, no_backprop=False):
+ """
+ A modular helper function that can be overridden in case
+ subclasses would like to modify training behavior for the
+ action sampler.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ no_backprop (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ outputs (dict): dictionary of outputs to use during critic training
+ (for computing target values)
+ """
+ info = OrderedDict()
+ if self.algo_config.action_sampler.vae.prior.use_categorical:
+ temperature = self.algo_config.action_sampler.vae.prior.categorical_init_temp - epoch * self.algo_config.action_sampler.vae.prior.categorical_temp_anneal_step
+ temperature = max(temperature, self.algo_config.action_sampler.vae.prior.categorical_min_temp)
+ self.nets["action_sampler"].set_gumbel_temperature(temperature)
+
+ vae_inputs = dict(
+ actions=batch["actions"],
+ obs_dict=batch["obs"],
+ goal_dict=batch["goal_obs"],
+ )
+
+ # maybe freeze encoder weights
+ if (self.algo_config.action_sampler.freeze_encoder_epoch != -1) and (epoch >= self.algo_config.action_sampler.freeze_encoder_epoch):
+ vae_inputs["freeze_encoder"] = True
+
+ # VAE forward
+ vae_outputs = self.nets["action_sampler"].forward_train(**vae_inputs)
+ recons_loss = vae_outputs["reconstruction_loss"]
+ kl_loss = vae_outputs["kl_loss"]
+ vae_loss = recons_loss + self.algo_config.action_sampler.vae.kl_weight * kl_loss
+ info["action_sampler/loss"] = vae_loss
+ info["action_sampler/recons_loss"] = recons_loss
+ info["action_sampler/kl_loss"] = kl_loss
+ if not self.algo_config.action_sampler.vae.prior.use_categorical:
+ with torch.no_grad():
+ encoder_variance = torch.exp(vae_outputs["encoder_params"]["logvar"]).mean()
+ info["action_sampler/encoder_variance"] = encoder_variance
+ outputs = TensorUtils.detach(vae_outputs)
+
+ # VAE gradient step
+ if not no_backprop:
+ vae_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["action_sampler"],
+ optim=self.optimizers["action_sampler"],
+ loss=vae_loss,
+ )
+ info["action_sampler/grad_norms"] = vae_grad_norms
+ return info, outputs
+
+ def _train_critic_on_batch(self, batch, action_sampler_outputs, epoch, no_backprop=False):
+ """
+ A modular helper function that can be overridden in case
+ subclasses would like to modify training behavior for the
+ critics.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ action_sampler_outputs (dict): dictionary of outputs from the action sampler. Used
+ to form target values for training the critic
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ no_backprop (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ critic_outputs (dict): dictionary of critic outputs - useful for
+ logging purposes
+ """
+ info = OrderedDict()
+
+ # batch variables
+ s_batch = batch["obs"]
+ a_batch = batch["actions"]
+ r_batch = batch["rewards"]
+ ns_batch = batch["next_obs"]
+ goal_s_batch = batch["goal_obs"]
+
+ # 1 if not done, 0 otherwise
+ done_mask_batch = 1. - batch["dones"]
+ info["done_masks"] = done_mask_batch
+
+ # Bellman backup for Q-targets
+ q_targets = self._get_target_values(
+ next_states=ns_batch,
+ goal_states=goal_s_batch,
+ rewards=r_batch,
+ dones=done_mask_batch,
+ action_sampler_outputs=action_sampler_outputs,
+ )
+ info["critic/q_targets"] = q_targets
+
+ # Train all critics using this set of targets for regression
+ critic_outputs = []
+ for critic_ind, critic in enumerate(self.nets["critic"]):
+ critic_loss, critic_output = self._compute_critic_loss(
+ critic=critic,
+ states=s_batch,
+ actions=a_batch,
+ goal_states=goal_s_batch,
+ q_targets=q_targets,
+ )
+ info["critic/critic{}_loss".format(critic_ind + 1)] = critic_loss
+ critic_outputs.append(critic_output)
+
+ if not no_backprop:
+ critic_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["critic"][critic_ind],
+ optim=self.optimizers["critic"][critic_ind],
+ loss=critic_loss,
+ max_grad_norm=self.algo_config.critic.max_gradient_norm,
+ )
+ info["critic/critic{}_grad_norms".format(critic_ind + 1)] = critic_grad_norms
+
+ return info, critic_outputs
+
+ def _train_actor_on_batch(self, batch, action_sampler_outputs, critic_outputs, epoch, no_backprop=False):
+ """
+ A modular helper function that can be overridden in case
+ subclasses would like to modify training behavior for the
+ perturbation actor.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ action_sampler_outputs (dict): dictionary of outputs from the action sampler. Currently
+ unused, although more sophisticated models may use it.
+
+ critic_outputs (dict): dictionary of outputs from the critic. Currently
+ unused, although more sophisticated models may use it.
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ no_backprop (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ assert self.algo_config.actor.enabled
+
+ info = OrderedDict()
+
+ # Actor loss (update with DDPG loss)
+ s_batch = batch["obs"]
+ goal_s_batch = batch["goal_obs"]
+
+ # sample some actions from action sampler and perturb them, then improve perturbations
+ # where improvement is measured by the critic
+ sampled_actions = self.nets["action_sampler"](s_batch, goal_s_batch).detach() # don't backprop into samples
+ perturbed_actions = self.nets["actor"](s_batch, sampled_actions, goal_s_batch)
+ actor_loss = -(self.nets["critic"][0](s_batch, perturbed_actions, goal_s_batch)).mean()
+ info["actor/loss"] = actor_loss
+
+ if not no_backprop:
+ actor_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["actor"],
+ optim=self.optimizers["actor"],
+ loss=actor_loss,
+ )
+ info["actor/grad_norms"] = actor_grad_norms
+
+ return info
+
+ def _get_target_values(self, next_states, goal_states, rewards, dones, action_sampler_outputs=None):
+ """
+ Helper function to get target values for training Q-function with TD-loss.
+
+ Args:
+ next_states (dict): batch of next observations
+ goal_states (dict): if not None, batch of goal observations
+ rewards (torch.Tensor): batch of rewards - should be shape (B, 1)
+            dones (torch.Tensor): batch of not-done masks (1 if not done, 0 if done) - should be shape (B, 1)
+ action_sampler_outputs (dict): dictionary of outputs from the action sampler. Currently
+ unused, although more sophisticated models may use it.
+
+ Returns:
+ q_targets (torch.Tensor): target Q-values to use for TD loss
+ """
+
+ with torch.no_grad():
+ # we need to stack the observations with redundancy @num_action_samples here, then decode
+ # to get all sampled actions. for example, if we generate 2 samples per observation and
+ # the batch size is 3, then ob_tiled = [ob1; ob1; ob2; ob2; ob3; ob3]
+ next_states_tiled = ObsUtils.repeat_and_stack_observation(next_states, n=self.algo_config.critic.num_action_samples)
+ goal_states_tiled = None
+ if len(self.goal_shapes) > 0:
+ goal_states_tiled = ObsUtils.repeat_and_stack_observation(goal_states, n=self.algo_config.critic.num_action_samples)
+
+ # sample action proposals
+ next_sampled_actions = self._sample_actions_for_value_maximization(
+ states_tiled=next_states_tiled,
+ goal_states_tiled=goal_states_tiled,
+ for_target_update=True,
+ )
+
+ q_targets = self._get_target_values_from_sampled_actions(
+ next_states_tiled=next_states_tiled,
+ next_sampled_actions=next_sampled_actions,
+ goal_states_tiled=goal_states_tiled,
+ rewards=rewards,
+ dones=dones,
+ )
+
+ return q_targets
+
+ def _sample_actions_for_value_maximization(self, states_tiled, goal_states_tiled, for_target_update):
+ """
+ Helper function to sample actions for maximization (the "batch-constrained" part of
+ batch-constrained q-learning).
+
+ Args:
+ states_tiled (dict): observations to use for sampling actions. Assumes that tiling
+ has already occurred - so that if the batch size is B, and N samples are
+ desired for each observation in the batch, the leading dimension for each
+ observation in the dict is B * N
+
+ goal_states_tiled (dict): if not None, goal observations
+
+ for_target_update (bool): if True, actions are being sampled for use in training the
+ critic - which means the target actor network should be used
+
+ Returns:
+ sampled_actions (torch.Tensor): actions sampled from the action sampler, and maybe
+ perturbed by the actor network
+ """
+
+ with torch.no_grad():
+ sampled_actions = self.nets["action_sampler"](states_tiled, goal_states_tiled)
+ if self.algo_config.actor.enabled:
+ actor = self.nets["actor"]
+ if for_target_update:
+ actor = self.nets["actor_target"]
+ # perturb the actions with the policy
+ sampled_actions = actor(states_tiled, sampled_actions, goal_states_tiled)
+
+ return sampled_actions
+
+ def _get_target_values_from_sampled_actions(self, next_states_tiled, next_sampled_actions, goal_states_tiled, rewards, dones):
+ """
+ Helper function to get target values for training Q-function with TD-loss. The function
+ assumes that action candidates to maximize over have already been computed, and that
+ the input states have been tiled (repeated) to be compatible with the sampled actions.
+
+ Args:
+ next_states_tiled (dict): next observations to use for sampling actions. Assumes that
+ tiling has already occurred - so that if the batch size is B, and N samples are
+ desired for each observation in the batch, the leading dimension for each
+ observation in the dict is B * N
+
+ next_sampled_actions (torch.Tensor): actions sampled from the action sampler. This function
+ will maximize the critic over these action candidates (using the TD3 trick)
+
+ goal_states_tiled (dict): if not None, goal observations
+
+ rewards (torch.Tensor): batch of rewards - should be shape (B, 1)
+
+            dones (torch.Tensor): batch of not-done masks (1 if not done, 0 if done) - should be shape (B, 1)
+
+ Returns:
+ q_targets (torch.Tensor): target Q-values to use for TD loss
+ """
+ with torch.no_grad():
+ # feed tiled observations and sampled actions into the critics and then
+ # reshape to get all Q-values in second dimension per observation in batch.
+ all_value_targets = self.nets["critic_target"][0](next_states_tiled, next_sampled_actions, goal_states_tiled).reshape(
+ -1, self.algo_config.critic.num_action_samples)
+ max_value_targets = all_value_targets
+ min_value_targets = all_value_targets
+
+ # TD3 trick to combine max and min over all Q-ensemble estimates into single target estimates
+ for critic_target in self.nets["critic_target"][1:]:
+ all_value_targets = critic_target(next_states_tiled, next_sampled_actions, goal_states_tiled).reshape(
+ -1, self.algo_config.critic.num_action_samples)
+ max_value_targets = torch.max(max_value_targets, all_value_targets)
+ min_value_targets = torch.min(min_value_targets, all_value_targets)
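+            # e.g. an ensemble weight of 0.75 gives 0.75 * min + 0.25 * max, biasing the
+            # combined target toward the pessimistic (min) estimate to curb overestimation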
+ all_value_targets = self.algo_config.critic.ensemble.weight * min_value_targets + \
+ (1. - self.algo_config.critic.ensemble.weight) * max_value_targets
+
+ # take maximum over all sampled action values per observation and compute targets
+ value_targets = torch.max(all_value_targets, dim=1, keepdim=True)[0]
+ q_targets = rewards + dones * self.discount * value_targets
+
+ return q_targets
+
+ def _compute_critic_loss(self, critic, states, actions, goal_states, q_targets):
+ """
+ Helper function to compute loss between estimated Q-values and target Q-values.
+ It should also return outputs needed for downstream training (for training the
+ actor).
+
+ Args:
+ critic (torch.nn.Module): critic network
+ states (dict): batch of observations
+ actions (torch.Tensor): batch of actions
+ goal_states (dict): if not None, batch of goal observations
+ q_targets (torch.Tensor): batch of target q-values for the TD loss
+
+ Returns:
+ critic_loss (torch.Tensor): critic loss
+ critic_output (dict): additional outputs from the critic. This function
+ returns None, but subclasses may want to provide some information
+ here.
+ """
+ q_estimated = critic(states, actions, goal_states)
+ if self.algo_config.critic.use_huber:
+ critic_loss = nn.SmoothL1Loss()(q_estimated, q_targets)
+ else:
+ critic_loss = nn.MSELoss()(q_estimated, q_targets)
+ return critic_loss, None
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ with TorchUtils.maybe_no_grad(no_grad=validate):
+ info = PolicyAlgo.train_on_batch(self, batch, epoch, validate=validate)
+
+ # Action Sampler training
+ no_action_sampler_backprop = validate or (not self._check_epoch(net_name="action_sampler", epoch=epoch))
+ with TorchUtils.maybe_no_grad(no_grad=no_action_sampler_backprop):
+ action_sampler_info, action_sampler_outputs = self._train_action_sampler_on_batch(
+ batch=batch,
+ epoch=epoch,
+ no_backprop=no_action_sampler_backprop,
+ )
+ info.update(action_sampler_info)
+
+ # make sure action sampler is in eval mode for models like GMM which may require low-noise
+ # samples when sampling actions.
+ self.nets["action_sampler"].eval()
+
+ # Critic training
+ no_critic_backprop = validate or (not self._check_epoch(net_name="critic", epoch=epoch))
+ with TorchUtils.maybe_no_grad(no_grad=no_critic_backprop):
+ critic_info, critic_outputs = self._train_critic_on_batch(
+ batch=batch,
+ action_sampler_outputs=action_sampler_outputs,
+ epoch=epoch,
+ no_backprop=no_critic_backprop,
+ )
+ info.update(critic_info)
+
+ if self.algo_config.actor.enabled:
+ # Actor training
+ no_actor_backprop = validate or (not self._check_epoch(net_name="actor", epoch=epoch))
+ with TorchUtils.maybe_no_grad(no_grad=no_actor_backprop):
+ actor_info = self._train_actor_on_batch(
+ batch=batch,
+ action_sampler_outputs=action_sampler_outputs,
+ critic_outputs=critic_outputs,
+ epoch=epoch,
+ no_backprop=no_actor_backprop,
+ )
+ info.update(actor_info)
+
+ if not validate:
+ # restore to train mode if necessary
+ self.nets["action_sampler"].train()
+
+ # update the target critic networks (only when critic has gradient update)
+ if not no_critic_backprop:
+ with torch.no_grad():
+ for critic_ind in range(len(self.nets["critic"])):
+ TorchUtils.soft_update(
+ source=self.nets["critic"][critic_ind],
+ target=self.nets["critic_target"][critic_ind],
+ tau=self.algo_config.target_tau,
+ )
+
+ # update target actor network (only when actor has gradient update)
+ if self.algo_config.actor.enabled and (not no_actor_backprop):
+ with torch.no_grad():
+ TorchUtils.soft_update(
+ source=self.nets["actor"],
+ target=self.nets["actor_target"],
+ tau=self.algo_config.target_tau,
+ )
+
+ return info
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ loss_log = OrderedDict()
+
+ # record current optimizer learning rates
+ for k in self.optimizers:
+ keys = [k]
+ optims = [self.optimizers[k]]
+ if k == "critic":
+ # account for critic having one optimizer per ensemble member
+ keys = ["{}{}".format(k, critic_ind) for critic_ind in range(len(self.nets["critic"]))]
+ optims = self.optimizers[k]
+ for kp, optimizer in zip(keys, optims):
+ for i, param_group in enumerate(optimizer.param_groups):
+ loss_log["Optimizer/{}{}_lr".format(kp, i)] = param_group["lr"]
+
+ # extract relevant logs for action sampler, critic, and actor
+ loss_log["Loss"] = 0.
+ for loss_logger in [self._log_action_sampler_info, self._log_critic_info, self._log_actor_info]:
+ this_log = loss_logger(info)
+ if "Loss" in this_log:
+ # manually merge total loss
+ loss_log["Loss"] += this_log["Loss"]
+ del this_log["Loss"]
+ loss_log.update(this_log)
+
+ return loss_log
+
+ def _log_action_sampler_info(self, info):
+ """
+ Helper function to extract action sampler-relevant information for logging.
+ """
+ loss_log = OrderedDict()
+ loss_log["Action_Sampler/Loss"] = info["action_sampler/loss"].item()
+ loss_log["Action_Sampler/Reconsruction_Loss"] = info["action_sampler/recons_loss"].item()
+ loss_log["Action_Sampler/KL_Loss"] = info["action_sampler/kl_loss"].item()
+ if self.algo_config.action_sampler.vae.prior.use_categorical:
+ loss_log["Action_Sampler/Gumbel_Temperature"] = self.nets["action_sampler"].get_gumbel_temperature()
+ else:
+ loss_log["Action_Sampler/Encoder_Variance"] = info["action_sampler/encoder_variance"].item()
+ if "action_sampler/grad_norms" in info:
+ loss_log["Action_Sampler/Grad_Norms"] = info["action_sampler/grad_norms"]
+ loss_log["Loss"] = loss_log["Action_Sampler/Loss"]
+ return loss_log
+
+ def _log_critic_info(self, info):
+ """
+ Helper function to extract critic-relevant information for logging.
+ """
+ loss_log = OrderedDict()
+ if "done_masks" in info:
+ loss_log["Critic/Done_Mask_Percentage"] = 100. * torch.mean(info["done_masks"]).item()
+ if "critic/q_targets" in info:
+ loss_log["Critic/Q_Targets"] = info["critic/q_targets"].mean().item()
+ loss_log["Loss"] = 0.
+ for critic_ind in range(len(self.nets["critic"])):
+ loss_log["Critic/Critic{}_Loss".format(critic_ind + 1)] = info["critic/critic{}_loss".format(critic_ind + 1)].item()
+ if "critic/critic{}_grad_norms".format(critic_ind + 1) in info:
+ loss_log["Critic/Critic{}_Grad_Norms".format(critic_ind + 1)] = info["critic/critic{}_grad_norms".format(critic_ind + 1)]
+ loss_log["Loss"] += loss_log["Critic/Critic{}_Loss".format(critic_ind + 1)]
+ return loss_log
+
+ def _log_actor_info(self, info):
+ """
+ Helper function to extract actor-relevant information for logging.
+ """
+ loss_log = OrderedDict()
+ if self.algo_config.actor.enabled:
+ loss_log["Actor/Loss"] = info["actor/loss"].item()
+ if "actor/grad_norms" in info:
+ loss_log["Actor/Grad_Norms"] = info["actor/grad_norms"]
+ loss_log["Loss"] = loss_log["Actor/Loss"]
+ return loss_log
+
+ def set_train(self):
+ """
+        Prepare networks for training. Update from super class to make sure
+ target networks stay in evaluation mode all the time.
+ """
+ self.nets.train()
+
+ # target networks always in eval
+ for critic_ind in range(len(self.nets["critic_target"])):
+ self.nets["critic_target"][critic_ind].eval()
+
+ if self.algo_config.actor.enabled:
+ self.nets["actor_target"].eval()
+
+ def on_epoch_end(self, epoch):
+ """
+ Called at the end of each epoch.
+ """
+
+ # LR scheduling updates
+ for lr_sc in self.lr_schedulers["critic"]:
+ if lr_sc is not None:
+ lr_sc.step()
+
+ if self.lr_schedulers["action_sampler"] is not None:
+ self.lr_schedulers["action_sampler"].step()
+
+ if self.algo_config.actor.enabled and self.lr_schedulers["actor"] is not None:
+ self.lr_schedulers["actor"].step()
+
+ def _get_best_value(self, obs_dict, goal_dict=None):
+ """
+ Internal helper function for getting the best value for a given state and
+ the corresponding best action. Meant to be used at test-time. Key differences
+ between this and retrieving target values at train-time are that (1) only a
+ single critic is used for the value estimate and (2) the critic and actor
+ are used instead of the target critic and target actor.
+
+ Args:
+ obs_dict (dict): batch of current observations
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ best_value (torch.Tensor): best values
+ best_action (torch.Tensor): best actions
+ """
+ assert not self.nets.training
+
+ random_key = list(obs_dict.keys())[0]
+ batch_size = obs_dict[random_key].shape[0]
+
+ # number of action proposals from action sampler
+ num_action_samples = self.algo_config.critic.num_action_samples_rollout
+
+ # we need to stack the observations with redundancy @num_action_samples here, then decode
+ # to get all sampled actions. for example, if we generate 2 samples per observation and
+ # the batch size is 3, then ob_tiled = [ob1; ob1; ob2; ob2; ob3; ob3]
+ ob_tiled = ObsUtils.repeat_and_stack_observation(obs_dict, n=num_action_samples)
+ goal_tiled = None
+ if len(self.goal_shapes) > 0:
+ goal_tiled = ObsUtils.repeat_and_stack_observation(goal_dict, n=num_action_samples)
+
+ sampled_actions = self._sample_actions_for_value_maximization(
+ states_tiled=ob_tiled,
+ goal_states_tiled=goal_tiled,
+ for_target_update=False,
+ )
+
+ # feed tiled observations and perturbed sampled actions into the critic and then
+ # reshape to get all Q-values in second dimension per observation in batch.
+ # finally, just take a maximum across that second dimension to take the best sampled action
+ all_critic_values = self.nets["critic"][0](ob_tiled, sampled_actions, goal_tiled).reshape(-1, num_action_samples)
+ best_action_index = torch.argmax(all_critic_values, dim=1)
+
+ all_actions = sampled_actions.reshape(batch_size, num_action_samples, -1)
+ best_action = all_actions[torch.arange(all_actions.shape[0]), best_action_index]
+ best_value = all_critic_values[torch.arange(all_critic_values.shape[0]), best_action_index].unsqueeze(1)
+
+ return best_value, best_action
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ assert not self.nets.training
+
+ _, best_action = self._get_best_value(obs_dict=obs_dict, goal_dict=goal_dict)
+ return best_action
+
+ def get_state_value(self, obs_dict, goal_dict=None):
+ """
+ Get state value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ assert not self.nets.training
+
+ best_value, _ = self._get_best_value(obs_dict=obs_dict, goal_dict=goal_dict)
+ return best_value
+
+ def get_state_action_value(self, obs_dict, actions, goal_dict=None):
+ """
+ Get state-action value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ actions (torch.Tensor): action
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ assert not self.nets.training
+
+ return self.nets["critic"][0](obs_dict, actions, goal_dict)
+
+
+class BCQ_GMM(BCQ):
+ """
+ A simple modification to BCQ that replaces the VAE used to sample action proposals from the
+ batch with a GMM.
+ """
+ def _create_action_sampler(self):
+ """
+ Called in @_create_networks to make action sampler network.
+ """
+ assert self.algo_config.action_sampler.gmm.enabled
+
+ # GMM network for approximate sampling from batch dataset
+ self.nets["action_sampler"] = PolicyNets.GMMActorNetwork(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.action_sampler.actor_layer_dims,
+ num_modes=self.algo_config.action_sampler.gmm.num_modes,
+ min_std=self.algo_config.action_sampler.gmm.min_std,
+ std_activation=self.algo_config.action_sampler.gmm.std_activation,
+ low_noise_eval=self.algo_config.action_sampler.gmm.low_noise_eval,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ def _train_action_sampler_on_batch(self, batch, epoch, no_backprop=False):
+ """
+        Modified from the superclass to train the GMM action sampler
+        with maximum likelihood.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ no_backprop (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ outputs (dict): dictionary of outputs to use during critic training
+ (for computing target values)
+ """
+ info = OrderedDict()
+
+ # GMM forward
+ dists = self.nets["action_sampler"].forward_train(
+ obs_dict=batch["obs"],
+ goal_dict=batch["goal_obs"],
+ )
+
+ # make sure that this is a batch of multivariate action distributions, so that
+ # the log probability computation will be correct
+ assert len(dists.batch_shape) == 1
+ log_probs = dists.log_prob(batch["actions"])
+ loss = -log_probs.mean()
+ info["action_sampler/loss"] = loss
+
+ # GMM gradient step
+ if not no_backprop:
+ gmm_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["action_sampler"],
+ optim=self.optimizers["action_sampler"],
+ loss=loss,
+ )
+ info["action_sampler/grad_norms"] = gmm_grad_norms
+ return info, None
+
+ def _log_action_sampler_info(self, info):
+ """
+ Update from superclass for GMM (no KL loss).
+ """
+ loss_log = OrderedDict()
+ loss_log["Action_Sampler/Loss"] = info["action_sampler/loss"].item()
+ if "action_sampler/grad_norms" in info:
+ loss_log["Action_Sampler/Grad_Norms"] = info["action_sampler/grad_norms"]
+ loss_log["Loss"] = loss_log["Action_Sampler/Loss"]
+ return loss_log
+
+
+class BCQ_Distributional(BCQ):
+ """
+ BCQ with distributional critics. Distributional critics output categorical
+ distributions over a discrete set of values instead of expected returns.
+ Some parts of this implementation were adapted from ACME (https://github.com/deepmind/acme).
+ """
+ def _create_critics(self):
+ """
+ Called in @_create_networks to make critic networks.
+ """
+ assert self.algo_config.critic.distributional.enabled
+ critic_class = ValueNets.DistributionalActionValueNetwork
+ critic_args = dict(
+ obs_shapes=self.obs_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.critic.layer_dims,
+ value_bounds=self.algo_config.critic.value_bounds,
+ num_atoms=self.algo_config.critic.distributional.num_atoms,
+ goal_shapes=self.goal_shapes,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ # Q network ensemble and target ensemble
+ self.nets["critic"] = nn.ModuleList()
+ self.nets["critic_target"] = nn.ModuleList()
+
+ # NOTE: ensemble value in config is ignored, and only 1 critic is used.
+ critic = critic_class(**critic_args)
+ self.nets["critic"].append(critic)
+
+ critic_target = critic_class(**critic_args)
+ self.nets["critic_target"].append(critic_target)
+
+ def _get_target_values_from_sampled_actions(self, next_states_tiled, next_sampled_actions, goal_states_tiled, rewards, dones):
+ """
+ Helper function to get target values for training Q-function with TD-loss. Update from superclass
+ to account for distributional value functions.
+
+ Args:
+ next_states_tiled (dict): next observations to use for sampling actions. Assumes that
+ tiling has already occurred - so that if the batch size is B, and N samples are
+ desired for each observation in the batch, the leading dimension for each
+ observation in the dict is B * N
+
+ next_sampled_actions (torch.Tensor): actions sampled from the action sampler. This function
+ will maximize the critic over these action candidates (using the TD3 trick)
+
+ goal_states_tiled (dict): if not None, goal observations
+
+ rewards (torch.Tensor): batch of rewards - should be shape (B, 1)
+
+            dones (torch.Tensor): batch of not-done masks (1 if not done, 0 if done) - should be shape (B, 1)
+
+ Returns:
+ target_categorical_probabilities (torch.Tensor): target categorical probabilities
+ to use in the bellman backup
+ """
+
+ with torch.no_grad():
+ # compute expected returns of the sampled actions and maximize to find the best action
+ all_vds = self.nets["critic_target"][0].forward_train(next_states_tiled, next_sampled_actions, goal_states_tiled)
+ expected_values = all_vds.mean().reshape(-1, self.algo_config.critic.num_action_samples)
+ best_action_index = torch.argmax(expected_values, dim=1)
+ all_actions = next_sampled_actions.reshape(-1, self.algo_config.critic.num_action_samples, self.ac_dim)
+ best_action = all_actions[torch.arange(all_actions.shape[0]), best_action_index]
+
+ # get the corresponding probabilities for the categorical distributions corresponding to the best actions
+ all_vd_probs = all_vds.probs.reshape(-1, self.algo_config.critic.num_action_samples, self.algo_config.critic.distributional.num_atoms)
+ target_vd_probs = all_vd_probs[torch.arange(all_vd_probs.shape[0]), best_action_index]
+
+ # bellman backup to get a new grid of values - then project onto the canonical atoms to obtain a
+ # target set of categorical probabilities over the atoms
+ atom_value_grid = all_vds.values
+ target_value_grid = rewards + dones * self.discount * atom_value_grid
+ target_categorical_probabilities = LossUtils.project_values_onto_atoms(
+ values=target_value_grid,
+ probabilities=target_vd_probs,
+ atoms=atom_value_grid,
+ )
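+            # the projection (as in C51-style distributional RL) redistributes the probability mass
+            # of each backed-up atom onto the neighboring canonical atoms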
+
+ return target_categorical_probabilities
+
+ def _compute_critic_loss(self, critic, states, actions, goal_states, q_targets):
+ """
+ Overrides super class to compute a distributional loss. Since values are
+ categorical distributions, this is just computing a cross-entropy
+ loss between the two distributions.
+
+ NOTE: q_targets is expected to be a batch of normalized probability vectors that correspond to
+ the target categorical distributions over the value atoms.
+
+ Args:
+ critic (torch.nn.Module): critic network
+ states (dict): batch of observations
+ actions (torch.Tensor): batch of actions
+ goal_states (dict): if not None, batch of goal observations
+ q_targets (torch.Tensor): batch of target q-values for the TD loss
+
+ Returns:
+ critic_loss (torch.Tensor): critic loss
+ critic_output (dict): additional outputs from the critic. This function
+ returns None, but subclasses may want to provide some information
+ here.
+ """
+
+ # this should be the equivalent of softmax with logits from tf
+ vd = critic.forward_train(states, actions, goal_states)
+ log_probs = F.log_softmax(vd.logits, dim=-1)
+ critic_loss = nn.KLDivLoss(reduction='batchmean')(log_probs, q_targets)
+ return critic_loss, None
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/cql.py b/phantom/submodules/phantom-robomimic/robomimic/algo/cql.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c24d50abd91426a4d96e91896c958b7df1ada0a
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/cql.py
@@ -0,0 +1,668 @@
+"""
+Implementation of Conservative Q-Learning (CQL).
+Based off of https://github.com/aviralkumar2907/CQL.
+(Paper - https://arxiv.org/abs/2006.04779).
+"""
+import numpy as np
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+import robomimic.models.base_nets as BaseNets
+import robomimic.models.obs_nets as ObsNets
+import robomimic.models.policy_nets as PolicyNets
+import robomimic.models.value_nets as ValueNets
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+from robomimic.algo import register_algo_factory_func, ValueAlgo, PolicyAlgo
+
+
+@register_algo_factory_func("cql")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the CQL algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+ return CQL, {}
+
+
+class CQL(PolicyAlgo, ValueAlgo):
+ """
+ CQL-extension of SAC for the off-policy, offline setting. See https://arxiv.org/abs/2006.04779
+ """
+ def __init__(self, **kwargs):
+ # Store entropy / cql settings first since the super init call requires them
+ self.automatic_entropy_tuning = kwargs["algo_config"].actor.target_entropy is not None
+ self.automatic_cql_tuning = kwargs["algo_config"].critic.target_q_gap is not None and \
+ kwargs["algo_config"].critic.target_q_gap >= 0.0
+
+ # Run super init first
+ super().__init__(**kwargs)
+
+ # Reward settings
+ self.n_step = self.algo_config.n_step
+ self.discount = self.algo_config.discount ** self.n_step
+
+ # Now also store additional SAC- and CQL-specific stuff from the config
+ self._num_batch_steps = 0
+ self.bc_start_steps = self.algo_config.actor.bc_start_steps
+ self.deterministic_backup = self.algo_config.critic.deterministic_backup
+ self.td_loss_fcn = nn.SmoothL1Loss() if self.algo_config.critic.use_huber else nn.MSELoss()
+
+ # Entropy settings
+ self.target_entropy = -np.prod(self.ac_dim) if self.algo_config.actor.target_entropy in {None, "default"} else\
+ self.algo_config.actor.target_entropy
+
+ # CQL settings
+ self.min_q_weight = self.algo_config.critic.min_q_weight
+ self.target_q_gap = self.algo_config.critic.target_q_gap if self.automatic_cql_tuning else 0.0
+
+ @property
+ def log_entropy_weight(self):
+ return self.nets["log_entropy_weight"]() if self.automatic_entropy_tuning else\
+ torch.zeros(1, requires_grad=False, device=self.device)
+
+ @property
+ def log_cql_weight(self):
+ return self.nets["log_cql_weight"]() if self.automatic_cql_tuning else\
+ torch.log(torch.tensor(self.algo_config.critic.cql_weight, requires_grad=False, device=self.device))
+
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+
+ Networks for this algo: critic (potentially ensemble), policy
+ """
+
+ # Create nets
+ self.nets = nn.ModuleDict()
+
+ # Assemble args to pass to actor
+ actor_args = dict(self.algo_config.actor.net.common)
+
+ # Add network-specific args and define network class
+ if self.algo_config.actor.net.type == "gaussian":
+ actor_cls = PolicyNets.GaussianActorNetwork
+ actor_args.update(dict(self.algo_config.actor.net.gaussian))
+ else:
+ # Unsupported actor type!
+ raise ValueError(f"Unsupported actor requested. "
+ f"Requested: {self.algo_config.actor.net.type}, "
+ f"valid options are: {['gaussian']}")
+
+ # Policy
+ self.nets["actor"] = actor_cls(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor.layer_dims,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **actor_args,
+ )
+
+ # Critics
+ self.nets["critic"] = nn.ModuleList()
+ self.nets["critic_target"] = nn.ModuleList()
+ for _ in range(self.algo_config.critic.ensemble.n):
+ for net_list in (self.nets["critic"], self.nets["critic_target"]):
+ critic = ValueNets.ActionValueNetwork(
+ obs_shapes=self.obs_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.critic.layer_dims,
+ value_bounds=self.algo_config.critic.value_bounds,
+ goal_shapes=self.goal_shapes,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+ net_list.append(critic)
+
+ # Entropy (if automatically tuning)
+ if self.automatic_entropy_tuning:
+ self.nets["log_entropy_weight"] = BaseNets.Parameter(torch.zeros(1))
+
+ # CQL (if automatically tuning)
+ if self.automatic_cql_tuning:
+ self.nets["log_cql_weight"] = BaseNets.Parameter(torch.zeros(1))
+
+ # Send networks to appropriate device
+ self.nets = self.nets.float().to(self.device)
+
+ # sync target networks at beginning of training
+ with torch.no_grad():
+ for critic, critic_target in zip(self.nets["critic"], self.nets["critic_target"]):
+ TorchUtils.hard_update(
+ source=critic,
+ target=critic_target,
+ )
+
+ def _create_optimizers(self):
+ """
+ Creates optimizers using @self.optim_params and places them into @self.optimizers.
+
+        Overrides base method since we might need to create additional optimizers for the entropy
+ and cql weight parameters (by default, the base class only creates optimizers for all
+ entries in @self.nets that have corresponding entries in `self.optim_params` but these
+ parameters do not).
+ """
+
+ # Create actor and critic optimizers via super method
+ super()._create_optimizers()
+
+ # We still need to potentially create additional optimizers based on algo settings
+
+ # entropy (if automatically tuning)
+ if self.automatic_entropy_tuning:
+ self.optimizers["entropy"] = optim.Adam(
+ params=self.nets["log_entropy_weight"].parameters(),
+ lr=self.optim_params["actor"]["learning_rate"]["initial"],
+ weight_decay=0.0,
+ )
+
+ # cql (if automatically tuning)
+ if self.automatic_cql_tuning:
+ self.optimizers["cql"] = optim.Adam(
+ params=self.nets["log_cql_weight"].parameters(),
+ lr=self.optim_params["critic"]["learning_rate"]["initial"],
+ weight_decay=0.0,
+ )
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out relevant info and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+
+        # Make sure the trajectory of actions received is at least as long as our step horizon
+ assert batch["actions"].shape[1] >= self.n_step
+
+ # remove temporal batches for all
+ input_batch["obs"] = {k: batch["obs"][k][:, 0, :] for k in batch["obs"]}
+ input_batch["next_obs"] = {k: batch["next_obs"][k][:, self.n_step - 1, :] for k in batch["next_obs"]}
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+ input_batch["actions"] = batch["actions"][:, 0, :]
+
+ # note: ensure scalar signals (rewards, done) retain last dimension of 1 to be compatible with model outputs
+
+ # single timestep reward is discounted sum of intermediate rewards in sequence
+ reward_seq = batch["rewards"][:, :self.n_step]
+ discounts = torch.pow(self.algo_config.discount, torch.arange(self.n_step).float()).unsqueeze(0)
+ input_batch["rewards"] = (reward_seq * discounts).sum(dim=1).unsqueeze(1)
+
+        # consider this n-step sequence done if any intermediate dones are present
+ done_seq = batch["dones"][:, :self.n_step]
+ input_batch["dones"] = (done_seq.sum(dim=1) > 0).float().unsqueeze(1)
+
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ info = OrderedDict()
+
+ # Set the correct context for this training step
+ with TorchUtils.maybe_no_grad(no_grad=validate):
+ # Always run super call first
+ super_info = super().train_on_batch(batch, epoch, validate=validate)
+ # Train actor
+ actor_info = self._train_policy_on_batch(batch, epoch, validate)
+ # Train critic(s)
+ critic_info = self._train_critic_on_batch(batch, epoch, validate)
+ # Update info
+ info.update(super_info)
+ info.update(actor_info)
+ info.update(critic_info)
+
+ # Return stats
+ return info
+
+ def _train_policy_on_batch(self, batch, epoch, validate=False):
+ """
+ Training policy on a single batch of data.
+
+ Loss is the ExpValue over sampled states of the (weighted) logprob of a sampled action
+ under the current policy minus the Q value of associated with the (s, a) combo
+
+ Intuitively, this tries to improve the odds of sampling actions with high Q values while simultaneously
+ penalizing high probability actions.
+
+        Since we're in the continuous setting, we Monte Carlo sample.
+
+        Concretely:
+            Loss = Average[ entropy_weight * logprob(f(eps; s) | s) - Q(s, f(eps; s)) ]
+
+ where we use the reparameterization trick with Gaussian function f(*) to parameterize
+ actions as a function of the sampled noise param eps given input state s
+
+ Additionally, we update the (log) entropy weight parameter if we're tuning that as well.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ info = OrderedDict()
+
+ # Sample actions from policy and get log probs
+ dist = self.nets["actor"].forward_train(obs_dict=batch["obs"], goal_dict=batch["goal_obs"])
+ actions, log_prob = self._get_actions_and_log_prob(dist=dist)
+
+ # Calculate alpha
+ entropy_weight_loss = -(self.log_entropy_weight * (log_prob + self.target_entropy).detach()).mean() if\
+ self.automatic_entropy_tuning else 0.0
+ entropy_weight = self.log_entropy_weight.exp()
+
+ # Get predicted Q-values for all state, action pairs
+ pred_qs = [critic(obs_dict=batch["obs"], acts=actions, goal_dict=batch["goal_obs"])
+ for critic in self.nets["critic"]]
+ # We take the minimum for stability
+ pred_qs, _ = torch.cat(pred_qs, dim=1).min(dim=1, keepdim=True)
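+        # (clipped double-Q: the elementwise min across the critic ensemble guards against
+        # overestimated Q-values leaking into the policy objective)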
+
+ # Use BC if we're in the beginning of training, otherwise calculate policy loss normally
+ baseline = dist.log_prob(batch["actions"]).unsqueeze(dim=-1) if\
+ self._num_batch_steps < self.bc_start_steps else pred_qs
+ policy_loss = (entropy_weight * log_prob - baseline).mean()
+
+ # Add info
+ info["entropy_weight"] = entropy_weight.item()
+ info["entropy_weight_loss"] = entropy_weight_loss.item() if \
+ self.automatic_entropy_tuning else entropy_weight_loss
+ info["actor/loss"] = policy_loss
+
+ # Take a training step if we're not validating
+ if not validate:
+ # Update batch step
+ self._num_batch_steps += 1
+ if self.automatic_entropy_tuning:
+ # Alpha
+ self.optimizers["entropy"].zero_grad()
+ entropy_weight_loss.backward()
+ self.optimizers["entropy"].step()
+ info["entropy_grad_norms"] = self.log_entropy_weight.grad.data.norm(2).pow(2).item()
+
+ # Policy
+ actor_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["actor"],
+ optim=self.optimizers["actor"],
+ loss=policy_loss,
+ max_grad_norm=self.algo_config.actor.max_gradient_norm,
+ )
+ # Add info
+ info["actor/grad_norms"] = actor_grad_norms
+
+ # Return stats
+ return info
+
+ def _train_critic_on_batch(self, batch, epoch, validate=False):
+ """
+ Training critic(s) on a single batch of data.
+
+ For a given batch of (s, a, r, s') tuples and n sampled actions (a_, a'_ corresponding to actions
+ sampled from the learned policy at states s and s', respectively; a~ corresponding to uniformly random
+ sampled actions):
+
+ Loss = CQL_loss + SAC_loss
+
+        Since we're in the continuous setting, we Monte Carlo sample for all ExpValues, which become averages instead.
+
+ SAC_loss is the standard single-step TD error, corresponding to the following:
+
+ SAC_loss = 0.5 * Average[ (Q(s,a) - (r + Average over a'_ [ Q(s', a'_) ]))^2 ]
+
+        The CQL_loss is a weighted secondary objective: the (ExpValue of Q values over sampled states and
+        sampled actions from the LEARNED policy) minus the (ExpValue of Q values over sampled states and
+        sampled actions from the DATASET policy), plus a regularizer that is a function of the learned policy.
+
+        Intuitively, this tries to penalize (possibly arbitrarily erroneous) Q-values for actions produced by the
+        learned policy (which may yield out-of-distribution (s,a) pairs) while preserving (known) Q-values taken
+        from the dataset policy.
+
+ As we are using SAC, we choose our regularizer to correspond to the negative KL divergence between our
+ learned policy and a uniform distribution such that the first term in the CQL loss corresponds to the
+ soft maximum over all Q values at any state s.
+
+        For stability, we importance sample over uniformly random actions and actions drawn from the current policy at s and s'.
+
+ Moreover, if we want to tune the cql_weight automatically, we include the threshold value target_q_gap
+ to penalize Q values that are overly-optimistic by the given threshold.
+
+ In this case, the CQL_loss is as follows:
+
+        CQL_loss = cql_weight * ( min_q_weight * ( Average[ log Sum over a` in {a~, a_, a_'}: exp(Q(s,a`) - logprob(a`)) ] - Average[ Q(s,a) ] ) - target_q_gap )
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ info = OrderedDict()
+ B, A = batch["actions"].shape
+ N = self.algo_config.critic.num_random_actions
+
+ # Get predicted Q-values from taken actions
+ q_preds = [critic(obs_dict=batch["obs"], acts=batch["actions"], goal_dict=batch["goal_obs"])
+ for critic in self.nets["critic"]]
+
+ # Sample actions at the current and next step
+ curr_dist = self.nets["actor"].forward_train(obs_dict=batch["obs"], goal_dict=batch["goal_obs"])
+ next_dist = self.nets["actor"].forward_train(obs_dict=batch["next_obs"], goal_dict=batch["goal_obs"])
+ next_actions, next_log_prob = self._get_actions_and_log_prob(dist=next_dist)
+
+ # Don't capture gradients here, since the critic target network doesn't get trained (only soft updated)
+ with torch.no_grad():
+ # We take the max over all samples if the number of action samples is > 1
+ if self.algo_config.critic.num_action_samples > 1:
+ # Generate the target q values, using the backup from the next state
+ temp_actions = next_dist.rsample(sample_shape=(self.algo_config.critic.num_action_samples,)).permute(1, 0, 2)
+ target_qs = [self._get_qs_from_actions(
+ obs_dict=batch["next_obs"], actions=temp_actions, goal_dict=batch["goal_obs"], q_net=critic)
+ .max(dim=1, keepdim=True)[0] for critic in self.nets["critic_target"]]
+ else:
+ target_qs = [critic(obs_dict=batch["next_obs"], acts=next_actions, goal_dict=batch["goal_obs"])
+ for critic in self.nets["critic_target"]]
+ # Take the minimum over all critics
+ target_qs, _ = torch.cat(target_qs, dim=1).min(dim=1, keepdim=True)
+ # If only sampled once from each critic and not using a deterministic backup, subtract the logprob as well
+ if self.algo_config.critic.num_action_samples == 1 and not self.deterministic_backup:
+ target_qs = target_qs - self.log_entropy_weight.exp() * next_log_prob
+
+ # Calculate the q target values
+ done_mask_batch = 1. - batch["dones"]
+ info["done_masks"] = done_mask_batch
+ q_target = batch["rewards"] + done_mask_batch * self.discount * target_qs
+
+ # Calculate CQL stuff
+ cql_random_actions = torch.FloatTensor(N, B, A).uniform_(-1., 1.).to(self.device) # shape (N, B, A)
+ cql_random_log_prob = np.log(0.5 ** A)
+ cql_curr_actions, cql_curr_log_prob = self._get_actions_and_log_prob(dist=curr_dist, sample_shape=(N,)) # shape (N, B, A) and (N, B, 1)
+ cql_next_actions, cql_next_log_prob = self._get_actions_and_log_prob(dist=next_dist, sample_shape=(N,)) # shape (N, B, A) and (N, B, 1)
+ cql_curr_log_prob = cql_curr_log_prob.squeeze(dim=-1).permute(1, 0).detach() # shape (B, N)
+ cql_next_log_prob = cql_next_log_prob.squeeze(dim=-1).permute(1, 0).detach() # shape (B, N)
+ q_cats = [] # Each entry shape will be (B, N)
+
+ for critic, q_pred in zip(self.nets["critic"], q_preds):
+ # Compose Q values over all sampled actions (importance sampled)
+ q_rand = self._get_qs_from_actions(obs_dict=batch["obs"], actions=cql_random_actions.permute(1, 0, 2), goal_dict=batch["goal_obs"], q_net=critic)
+ q_curr = self._get_qs_from_actions(obs_dict=batch["obs"], actions=cql_curr_actions.permute(1, 0, 2), goal_dict=batch["goal_obs"], q_net=critic)
+ q_next = self._get_qs_from_actions(obs_dict=batch["obs"], actions=cql_next_actions.permute(1, 0, 2), goal_dict=batch["goal_obs"], q_net=critic)
+ q_cat = torch.cat([
+ q_rand - cql_random_log_prob,
+ q_next - cql_next_log_prob,
+ q_curr - cql_curr_log_prob,
+ ], dim=1) # shape (B, 3 * N)
+ q_cats.append(q_cat)
+
+ # Calculate the losses for all critics
+ cql_losses = []
+ critic_losses = []
+ cql_weight = torch.clamp(self.log_cql_weight.exp(), min=0.0, max=1000000.0)
+ info["critic/cql_weight"] = cql_weight.item()
+ for i, (q_pred, q_cat) in enumerate(zip(q_preds, q_cats)):
+ # Calculate td error loss
+ td_loss = self.td_loss_fcn(q_pred, q_target)
+ # Calculate cql loss
+ cql_loss = cql_weight * (self.min_q_weight * (torch.logsumexp(q_cat, dim=1).mean() - q_pred.mean()) -
+ self.target_q_gap)
+ cql_losses.append(cql_loss)
+ # Calculate total loss
+ loss = td_loss + cql_loss
+ critic_losses.append(loss)
+ info[f"critic/critic{i+1}_loss"] = loss
+
+ # Run gradient descent if we're not validating
+ if not validate:
+ # Train CQL weight if tuning automatically
+ if self.automatic_cql_tuning:
+ cql_weight_loss = -torch.stack(cql_losses).mean()
+                # store a detached scalar, since we retain the computation graph for the backward() calls below
+                info["critic/cql_weight_loss"] = cql_weight_loss.item()
+ self.optimizers["cql"].zero_grad()
+ cql_weight_loss.backward(retain_graph=True)
+ self.optimizers["cql"].step()
+ info["critic/cql_grad_norms"] = self.log_cql_weight.grad.data.norm(2).pow(2).item()
+
+ # Train critics
+ for i, (critic_loss, critic, critic_target, optimizer) in enumerate(zip(
+ critic_losses, self.nets["critic"], self.nets["critic_target"], self.optimizers["critic"]
+ )):
+ retain_graph = (i < (len(critic_losses) - 1))
+ critic_grad_norms = TorchUtils.backprop_for_loss(
+ net=critic,
+ optim=optimizer,
+ loss=critic_loss,
+ max_grad_norm=self.algo_config.critic.max_gradient_norm,
+ retain_graph=retain_graph,
+ )
+ info[f"critic/critic{i+1}_grad_norms"] = critic_grad_norms
+ with torch.no_grad():
+ TorchUtils.soft_update(source=critic, target=critic_target, tau=self.algo_config.target_tau)
+
+ # Return stats
+ return info
+
+ def _get_actions_and_log_prob(self, dist, sample_shape=torch.Size()):
+ """
+ Helper method to sample actions and compute corresponding log probabilities
+
+ Args:
+ dist (Distribution): Distribution to sample from
+ sample_shape (torch.Size or tuple): Shape of output when sampling (number of samples)
+
+ Returns:
+ 2-tuple:
+ - (tensor) sampled actions (..., B, ..., A)
+ - (tensor) corresponding log probabilities (..., B, ..., 1)
+ """
+ # Process networks with tanh differently than normal distributions
+ if self.algo_config.actor.net.common.use_tanh:
+ actions, actions_pre_tanh = dist.rsample(sample_shape=sample_shape, return_pretanh_value=True)
+ log_prob = dist.log_prob(actions, pre_tanh_value=actions_pre_tanh).unsqueeze(dim=-1)
+ else:
+ actions = dist.rsample(sample_shape=sample_shape)
+ log_prob = dist.log_prob(actions)
+
+ return actions, log_prob
+
+ @staticmethod
+ def _get_qs_from_actions(obs_dict, actions, goal_dict, q_net):
+ """
+ Helper function for grabbing Q values given a single state and multiple (N) sampled actions.
+
+ Args:
+ obs_dict (dict): Observation dict from batch
+ actions (tensor): Torch tensor, with dim1 assumed to be the extra sampled dimension
+ goal_dict (dict): Goal dict from batch
+ q_net (nn.Module): Q net to pass the observations and actions
+
+ Returns:
+ tensor: (B, N) corresponding Q values
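+
+        Example (shape sketch only; B=2 states and N=10 sampled actions are arbitrary choices):
+            actions: (2, 10, D) -> flattened to (20, D) before the critic call
+            critic output over the 20 flattened pairs -> reshaped back to (2, 10)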
+ """
+ # Get the number of sampled actions
+ B, N, D = actions.shape
+
+ # Repeat obs and goals in the batch dimension
+ obs_dict_stacked = ObsUtils.repeat_and_stack_observation(obs_dict, N)
+ goal_dict_stacked = ObsUtils.repeat_and_stack_observation(goal_dict, N)
+
+ # Pass the obs and (flattened) actions through to get the Q values
+ qs = q_net(obs_dict=obs_dict_stacked, acts=actions.reshape(-1, D), goal_dict=goal_dict_stacked)
+
+ # Unflatten output
+ qs = qs.reshape(B, N)
+
+ return qs
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ loss_log = OrderedDict()
+
+ # record current optimizer learning rates
+ for k in self.optimizers:
+ keys = [k]
+ optims = [self.optimizers[k]]
+ if k == "critic":
+ # account for critic having one optimizer per ensemble member
+ keys = ["{}{}".format(k, critic_ind) for critic_ind in range(len(self.nets["critic"]))]
+ optims = self.optimizers[k]
+ for kp, optimizer in zip(keys, optims):
+ for i, param_group in enumerate(optimizer.param_groups):
+ loss_log["Optimizer/{}{}_lr".format(kp, i)] = param_group["lr"]
+
+ # extract relevant logs for critic, and actor
+ loss_log["Loss"] = 0.
+ for loss_logger in [self._log_critic_info, self._log_actor_info]:
+ this_log = loss_logger(info)
+ if "Loss" in this_log:
+ # manually merge total loss
+ loss_log["Loss"] += this_log["Loss"]
+ del this_log["Loss"]
+ loss_log.update(this_log)
+
+ return loss_log
+
+ def _log_critic_info(self, info):
+ """
+ Helper function to extract critic-relevant information for logging.
+ """
+ loss_log = OrderedDict()
+ if "done_masks" in info:
+ loss_log["Critic/Done_Mask_Percentage"] = 100. * torch.mean(info["done_masks"]).item()
+ if "critic/q_targets" in info:
+ loss_log["Critic/Q_Targets"] = info["critic/q_targets"].mean().item()
+ loss_log["Loss"] = 0.
+ for critic_ind in range(len(self.nets["critic"])):
+ loss_log["Critic/Critic{}_Loss".format(critic_ind + 1)] = info["critic/critic{}_loss".format(critic_ind + 1)].item()
+ if "critic/critic{}_grad_norms".format(critic_ind + 1) in info:
+ loss_log["Critic/Critic{}_Grad_Norms".format(critic_ind + 1)] = info["critic/critic{}_grad_norms".format(critic_ind + 1)]
+ loss_log["Loss"] += loss_log["Critic/Critic{}_Loss".format(critic_ind + 1)]
+ if "critic/cql_weight_loss" in info:
+ loss_log["Critic/CQL_Weight"] = info["critic/cql_weight"]
+ loss_log["Critic/CQL_Weight_Loss"] = info["critic/cql_weight_loss"]
+ loss_log["Critic/CQL_Grad_Norms"] = info["critic/cql_grad_norms"]
+ return loss_log
+
+ def _log_actor_info(self, info):
+ """
+ Helper function to extract actor-relevant information for logging.
+ """
+ loss_log = OrderedDict()
+ loss_log["Actor/Loss"] = info["actor/loss"].item()
+ if "actor/grad_norms" in info:
+ loss_log["Actor/Grad_Norms"] = info["actor/grad_norms"]
+ loss_log["Loss"] = loss_log["Actor/Loss"]
+ loss_log["Entropy_Weight_Loss"] = info["entropy_weight_loss"]
+ loss_log["Entropy_Weight"] = info["entropy_weight"]
+ if "entropy_grad_norms" in info:
+ loss_log["Entropy_Grad_Norms"] = info["entropy_grad_norms"]
+ return loss_log
+
+ def set_train(self):
+ """
+        Prepare networks for training. Update from superclass to make sure
+        target networks stay in evaluation mode all the time.
+ """
+ self.nets.train()
+
+ # target networks always in eval
+ for critic in self.nets["critic_target"]:
+ critic.eval()
+
+ def on_epoch_end(self, epoch):
+ """
+ Called at the end of each epoch.
+ """
+
+ # LR scheduling updates
+ for lr_sc in self.lr_schedulers["critic"]:
+ if lr_sc is not None:
+ lr_sc.step()
+
+ if self.lr_schedulers["actor"] is not None:
+ self.lr_schedulers["actor"].step()
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ assert not self.nets.training
+
+ return self.nets["actor"](obs_dict=obs_dict, goal_dict=goal_dict)
+
+ def get_state_action_value(self, obs_dict, actions, goal_dict=None):
+ """
+ Get state-action value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ actions (torch.Tensor): action
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ assert not self.nets.training
+
+ return self.nets["critic"][0](obs_dict, actions, goal_dict)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/diffusion_policy.py b/phantom/submodules/phantom-robomimic/robomimic/algo/diffusion_policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..5262ae8b2aac4cc4f8e947fd4d6b8b513d8a83fb
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/diffusion_policy.py
@@ -0,0 +1,693 @@
+"""
+Implementation of Diffusion Policy https://diffusion-policy.cs.columbia.edu/ by Cheng Chi
+"""
+from typing import Callable, Union
+import math
+from collections import OrderedDict, deque
+from packaging.version import parse as parse_version
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# requires diffusers==0.11.1
+from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
+from diffusers.schedulers.scheduling_ddim import DDIMScheduler
+from diffusers.training_utils import EMAModel
+
+import robomimic.models.obs_nets as ObsNets
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.utils.obs_utils as ObsUtils
+
+from robomimic.algo import register_algo_factory_func, PolicyAlgo
+
+@register_algo_factory_func("diffusion_policy")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the BC algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+
+ if algo_config.unet.enabled:
+ return DiffusionPolicyUNet, {}
+ elif algo_config.transformer.enabled:
+ raise NotImplementedError()
+ else:
+ raise RuntimeError()
+
+class DiffusionPolicyUNet(PolicyAlgo):
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ # set up different observation groups for @MIMO_MLP
+ observation_group_shapes = OrderedDict()
+ observation_group_shapes["obs"] = OrderedDict(self.obs_shapes)
+ encoder_kwargs = ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder)
+
+ obs_encoder = ObsNets.ObservationGroupEncoder(
+ observation_group_shapes=observation_group_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+ # IMPORTANT!
+ # replace all BatchNorm with GroupNorm to work with EMA
+ # performance will tank if you forget to do this!
+ obs_encoder = replace_bn_with_gn(obs_encoder)
+
+ obs_dim = obs_encoder.output_shape()[0]
+
+ # create network object
+ noise_pred_net = ConditionalUnet1D(
+ input_dim=self.ac_dim,
+ global_cond_dim=obs_dim*self.algo_config.horizon.observation_horizon
+ )
+
+ # the final arch has 2 parts
+ nets = nn.ModuleDict({
+ 'policy': nn.ModuleDict({
+ 'obs_encoder': obs_encoder,
+ 'noise_pred_net': noise_pred_net
+ })
+ })
+
+ nets = nets.float().to(self.device)
+
+ # setup noise scheduler
+ noise_scheduler = None
+ if self.algo_config.ddpm.enabled:
+ noise_scheduler = DDPMScheduler(
+ num_train_timesteps=self.algo_config.ddpm.num_train_timesteps,
+ beta_schedule=self.algo_config.ddpm.beta_schedule,
+ clip_sample=self.algo_config.ddpm.clip_sample,
+ prediction_type=self.algo_config.ddpm.prediction_type
+ )
+ elif self.algo_config.ddim.enabled:
+ noise_scheduler = DDIMScheduler(
+ num_train_timesteps=self.algo_config.ddim.num_train_timesteps,
+ beta_schedule=self.algo_config.ddim.beta_schedule,
+ clip_sample=self.algo_config.ddim.clip_sample,
+ set_alpha_to_one=self.algo_config.ddim.set_alpha_to_one,
+ steps_offset=self.algo_config.ddim.steps_offset,
+ prediction_type=self.algo_config.ddim.prediction_type
+ )
+ else:
+ raise RuntimeError()
+
+ # setup EMA
+ ema = None
+ if self.algo_config.ema.enabled:
+ ema = EMAModel(parameters=nets.parameters(), power=self.algo_config.ema.power)
+
+ # set attrs
+ self.nets = nets
+ self._shadow_nets = copy.deepcopy(self.nets).eval()
+ self._shadow_nets.requires_grad_(False)
+ self.noise_scheduler = noise_scheduler
+ self.ema = ema
+ self.action_check_done = False
+ self.obs_queue = None
+ self.action_queue = None
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
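+
+        Example (illustrative horizons; e.g. To=2, Tp=16 as in common Diffusion Policy configs):
+            batch["obs"][k]  with shape (B, T, ...) -> input_batch["obs"][k]  with shape (B, To, ...)
+            batch["actions"] with shape (B, T, A)   -> input_batch["actions"] with shape (B, Tp, A)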
+ """
+ To = self.algo_config.horizon.observation_horizon
+ Ta = self.algo_config.horizon.action_horizon
+ Tp = self.algo_config.horizon.prediction_horizon
+
+ input_batch = dict()
+ input_batch["obs"] = {k: batch["obs"][k][:, :To, :] for k in batch["obs"]}
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+ input_batch["actions"] = batch["actions"][:, :Tp, :]
+
+ # check if actions are normalized to [-1,1]
+ if not self.action_check_done:
+ actions = input_batch["actions"]
+ in_range = (-1 <= actions) & (actions <= 1)
+ all_in_range = torch.all(in_range).item()
+ if not all_in_range:
+ raise ValueError('"actions" must be in range [-1,1] for Diffusion Policy! Check if hdf5_normalize_action is enabled.')
+ self.action_check_done = True
+
+ return TensorUtils.to_device(TensorUtils.to_float(input_batch), self.device)
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ To = self.algo_config.horizon.observation_horizon
+ Ta = self.algo_config.horizon.action_horizon
+ Tp = self.algo_config.horizon.prediction_horizon
+ action_dim = self.ac_dim
+ B = batch['actions'].shape[0]
+
+
+ with TorchUtils.maybe_no_grad(no_grad=validate):
+ info = super(DiffusionPolicyUNet, self).train_on_batch(batch, epoch, validate=validate)
+ actions = batch['actions']
+
+ # encode obs
+ inputs = {
+ 'obs': batch["obs"],
+ 'goal': batch["goal_obs"]
+ }
+ for k in self.obs_shapes:
+ # first two dimensions should be [B, T] for inputs
+ assert inputs['obs'][k].ndim - 2 == len(self.obs_shapes[k])
+
+ obs_features = TensorUtils.time_distributed(inputs, self.nets['policy']['obs_encoder'], inputs_as_kwargs=True)
+ assert obs_features.ndim == 3 # [B, T, D]
+
+ obs_cond = obs_features.flatten(start_dim=1)
+
+ # sample noise to add to actions
+ noise = torch.randn(actions.shape, device=self.device)
+
+ # sample a diffusion iteration for each data point
+ timesteps = torch.randint(
+ 0, self.noise_scheduler.config.num_train_timesteps,
+ (B,), device=self.device
+ ).long()
+
+ # add noise to the clean actions according to the noise magnitude at each diffusion iteration
+ # (this is the forward diffusion process)
+ noisy_actions = self.noise_scheduler.add_noise(
+ actions, noise, timesteps)
+
+ # predict the noise residual
+ noise_pred = self.nets['policy']['noise_pred_net'](
+ noisy_actions, timesteps, global_cond=obs_cond)
+
+ # L2 loss
+ loss = F.mse_loss(noise_pred, noise)
+
+ # logging
+ losses = {
+ 'l2_loss': loss
+ }
+ info["losses"] = TensorUtils.detach(losses)
+
+ if not validate:
+ # gradient step
+ policy_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets,
+ optim=self.optimizers["policy"],
+ loss=loss,
+ )
+
+ # update Exponential Moving Average of the model weights
+ if self.ema is not None:
+ self.ema.step(self.nets.parameters())
+
+ step_info = {
+ 'policy_grad_norms': policy_grad_norms
+ }
+ info.update(step_info)
+
+ return info
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ log = super(DiffusionPolicyUNet, self).log_info(info)
+ log["Loss"] = info["losses"]["l2_loss"].item()
+ if "policy_grad_norms" in info:
+ log["Policy_Grad_Norms"] = info["policy_grad_norms"]
+ return log
+
+ def reset(self):
+ """
+ Reset algo state to prepare for environment rollouts.
+ """
+ # setup inference queues
+ To = self.algo_config.horizon.observation_horizon
+ Ta = self.algo_config.horizon.action_horizon
+ obs_queue = deque(maxlen=To)
+ action_queue = deque(maxlen=Ta)
+ self.obs_queue = obs_queue
+ self.action_queue = action_queue
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation [1, Do]
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor [1, Da]
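+
+        Note (behavioral sketch): with action horizon Ta, the diffusion model is only queried when the
+        action queue is empty, i.e. roughly once every Ta environment steps; in between, actions are
+        replayed from the queue one at a time.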
+ """
+ # obs_dict: key: [1,D]
+ To = self.algo_config.horizon.observation_horizon
+ Ta = self.algo_config.horizon.action_horizon
+
+ # TODO: obs_queue already handled by frame_stack
+ # make sure we have at least To observations in obs_queue
+ # if not enough, repeat
+ # if already full, append one to the obs_queue
+ # n_repeats = max(To - len(self.obs_queue), 1)
+ # self.obs_queue.extend([obs_dict] * n_repeats)
+
+ if len(self.action_queue) == 0:
+ # no actions left, run inference
+ # turn obs_queue into dict of tensors (concat at T dim)
+ # import pdb; pdb.set_trace()
+ # obs_dict_list = TensorUtils.list_of_flat_dict_to_dict_of_list(list(self.obs_queue))
+ # obs_dict_tensor = dict((k, torch.cat(v, dim=0).unsqueeze(0)) for k,v in obs_dict_list.items())
+
+ # run inference
+ # [1,T,Da]
+ action_sequence = self._get_action_trajectory(obs_dict=obs_dict)
+
+ # put actions into the queue
+ self.action_queue.extend(action_sequence[0])
+
+ # has action, execute from left to right
+ # [Da]
+ action = self.action_queue.popleft()
+
+ # [1,Da]
+ action = action.unsqueeze(0)
+ return action
+
+ def _get_action_trajectory(self, obs_dict, goal_dict=None):
+ assert not self.nets.training
+ To = self.algo_config.horizon.observation_horizon
+ Ta = self.algo_config.horizon.action_horizon
+ Tp = self.algo_config.horizon.prediction_horizon
+ action_dim = self.ac_dim
+ if self.algo_config.ddpm.enabled is True:
+ num_inference_timesteps = self.algo_config.ddpm.num_inference_timesteps
+ elif self.algo_config.ddim.enabled is True:
+ num_inference_timesteps = self.algo_config.ddim.num_inference_timesteps
+ else:
+ raise ValueError
+
+ # select network
+ nets = self.nets
+ if self.ema is not None:
+ self.ema.copy_to(parameters=self._shadow_nets.parameters())
+ nets = self._shadow_nets
+
+ # encode obs
+ inputs = {
+ 'obs': obs_dict,
+ 'goal': goal_dict
+ }
+ for k in self.obs_shapes:
+ # first two dimensions should be [B, T] for inputs
+ assert inputs['obs'][k].ndim - 2 == len(self.obs_shapes[k])
+        obs_features = TensorUtils.time_distributed(inputs, nets['policy']['obs_encoder'], inputs_as_kwargs=True)
+ assert obs_features.ndim == 3 # [B, T, D]
+ B = obs_features.shape[0]
+
+ # reshape observation to (B,obs_horizon*obs_dim)
+ obs_cond = obs_features.flatten(start_dim=1)
+
+        # initialize action from Gaussian noise
+ noisy_action = torch.randn(
+ (B, Tp, action_dim), device=self.device)
+ naction = noisy_action
+
+ # init scheduler
+ self.noise_scheduler.set_timesteps(num_inference_timesteps)
+
+ for k in self.noise_scheduler.timesteps:
+ # predict noise
+ noise_pred = nets['policy']['noise_pred_net'](
+ sample=naction,
+ timestep=k,
+ global_cond=obs_cond
+ )
+
+ # inverse diffusion step (remove noise)
+ naction = self.noise_scheduler.step(
+ model_output=noise_pred,
+ timestep=k,
+ sample=naction
+ ).prev_sample
+
+ # process action using Ta
+ start = To - 1
+ end = start + Ta
+ action = naction[:,start:end]
+ return action
+
+ def serialize(self):
+ """
+ Get dictionary of current model parameters.
+ """
+ return {
+ "nets": self.nets.state_dict(),
+ "ema": self.ema.state_dict() if self.ema is not None else None,
+ }
+
+ def deserialize(self, model_dict):
+ """
+ Load model from a checkpoint.
+
+ Args:
+ model_dict (dict): a dictionary saved by self.serialize() that contains
+ the same keys as @self.network_classes
+ """
+ self.nets.load_state_dict(model_dict["nets"])
+ if model_dict.get("ema", None) is not None:
+ self.ema.load_state_dict(model_dict["ema"])
+
+
+# =================== Vision Encoder Utils =====================
+def replace_submodules(
+ root_module: nn.Module,
+ predicate: Callable[[nn.Module], bool],
+ func: Callable[[nn.Module], nn.Module]) -> nn.Module:
+ """
+ Replace all submodules selected by the predicate with
+ the output of func.
+
+ predicate: Return true if the module is to be replaced.
+ func: Return new module to use.
+ """
+ if predicate(root_module):
+ return func(root_module)
+
+ if parse_version(torch.__version__) < parse_version('1.9.0'):
+ raise ImportError('This function requires pytorch >= 1.9.0')
+
+ bn_list = [k.split('.') for k, m
+ in root_module.named_modules(remove_duplicate=True)
+ if predicate(m)]
+ for *parent, k in bn_list:
+ parent_module = root_module
+ if len(parent) > 0:
+ parent_module = root_module.get_submodule('.'.join(parent))
+ if isinstance(parent_module, nn.Sequential):
+ src_module = parent_module[int(k)]
+ else:
+ src_module = getattr(parent_module, k)
+ tgt_module = func(src_module)
+ if isinstance(parent_module, nn.Sequential):
+ parent_module[int(k)] = tgt_module
+ else:
+ setattr(parent_module, k, tgt_module)
+ # verify that all modules are replaced
+ bn_list = [k.split('.') for k, m
+ in root_module.named_modules(remove_duplicate=True)
+ if predicate(m)]
+ assert len(bn_list) == 0
+ return root_module
+
+def replace_bn_with_gn(
+ root_module: nn.Module,
+ features_per_group: int=16) -> nn.Module:
+ """
+    Replace all BatchNorm layers with GroupNorm.
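+
+    Example (illustrative sketch; torchvision is used here only as a convenient BatchNorm-heavy model):
+        import torchvision
+        model = replace_bn_with_gn(torchvision.models.resnet18(), features_per_group=16)
+        assert not any(isinstance(m, nn.BatchNorm2d) for m in model.modules())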
+ """
+ replace_submodules(
+ root_module=root_module,
+ predicate=lambda x: isinstance(x, nn.BatchNorm2d),
+ func=lambda x: nn.GroupNorm(
+ num_groups=x.num_features//features_per_group,
+ num_channels=x.num_features)
+ )
+ return root_module
+
+# =================== UNet for Diffusion ==============
+
+class SinusoidalPosEmb(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x):
+ device = x.device
+ half_dim = self.dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+ emb = x[:, None] * emb[None, :]
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+ return emb
+
+
+class Downsample1d(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.conv = nn.Conv1d(dim, dim, 3, 2, 1)
+
+ def forward(self, x):
+ return self.conv(x)
+
+class Upsample1d(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.conv = nn.ConvTranspose1d(dim, dim, 4, 2, 1)
+
+ def forward(self, x):
+ return self.conv(x)
+
+
+class Conv1dBlock(nn.Module):
+ '''
+ Conv1d --> GroupNorm --> Mish
+ '''
+
+ def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
+ super().__init__()
+
+ self.block = nn.Sequential(
+ nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2),
+ nn.GroupNorm(n_groups, out_channels),
+ nn.Mish(),
+ )
+
+ def forward(self, x):
+ return self.block(x)
+
+
+class ConditionalResidualBlock1D(nn.Module):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ cond_dim,
+ kernel_size=3,
+ n_groups=8):
+ super().__init__()
+
+ self.blocks = nn.ModuleList([
+ Conv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups),
+ Conv1dBlock(out_channels, out_channels, kernel_size, n_groups=n_groups),
+ ])
+
+ # FiLM modulation https://arxiv.org/abs/1709.07871
+ # predicts per-channel scale and bias
+ cond_channels = out_channels * 2
+ self.out_channels = out_channels
+ self.cond_encoder = nn.Sequential(
+ nn.Mish(),
+ nn.Linear(cond_dim, cond_channels),
+ nn.Unflatten(-1, (-1, 1))
+ )
+
+ # make sure dimensions compatible
+ self.residual_conv = nn.Conv1d(in_channels, out_channels, 1) \
+ if in_channels != out_channels else nn.Identity()
+
+ def forward(self, x, cond):
+ '''
+ x : [ batch_size x in_channels x horizon ]
+ cond : [ batch_size x cond_dim]
+
+ returns:
+ out : [ batch_size x out_channels x horizon ]
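+
+        Note: cond is mapped to 2 * out_channels values, split into a per-channel FiLM scale and bias
+        that are applied between the two conv blocks.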
+ '''
+ out = self.blocks[0](x)
+ embed = self.cond_encoder(cond)
+
+ embed = embed.reshape(
+ embed.shape[0], 2, self.out_channels, 1)
+ scale = embed[:,0,...]
+ bias = embed[:,1,...]
+ out = scale * out + bias
+
+ out = self.blocks[1](out)
+ out = out + self.residual_conv(x)
+ return out
+
+
+class ConditionalUnet1D(nn.Module):
+ def __init__(self,
+ input_dim,
+ global_cond_dim,
+ diffusion_step_embed_dim=256,
+ down_dims=[256,512,1024],
+ kernel_size=5,
+ n_groups=8
+ ):
+ """
+ input_dim: Dim of actions.
+ global_cond_dim: Dim of global conditioning applied with FiLM
+ in addition to diffusion step embedding. This is usually obs_horizon * obs_dim
+ diffusion_step_embed_dim: Size of positional encoding for diffusion iteration k
+ down_dims: Channel size for each UNet level.
+            The length of this array determines the number of levels.
+ kernel_size: Conv kernel size
+ n_groups: Number of groups for GroupNorm
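+
+        Example (illustrative sizes only, not taken from any config in this repo):
+            net = ConditionalUnet1D(input_dim=7, global_cond_dim=2 * 64)
+            # 7-dim actions conditioned on To=2 observation embeddings of dim 64 each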
+ """
+
+ super().__init__()
+ all_dims = [input_dim] + list(down_dims)
+ start_dim = down_dims[0]
+
+ dsed = diffusion_step_embed_dim
+ diffusion_step_encoder = nn.Sequential(
+ SinusoidalPosEmb(dsed),
+ nn.Linear(dsed, dsed * 4),
+ nn.Mish(),
+ nn.Linear(dsed * 4, dsed),
+ )
+ cond_dim = dsed + global_cond_dim
+
+ in_out = list(zip(all_dims[:-1], all_dims[1:]))
+ mid_dim = all_dims[-1]
+ self.mid_modules = nn.ModuleList([
+ ConditionalResidualBlock1D(
+ mid_dim, mid_dim, cond_dim=cond_dim,
+ kernel_size=kernel_size, n_groups=n_groups
+ ),
+ ConditionalResidualBlock1D(
+ mid_dim, mid_dim, cond_dim=cond_dim,
+ kernel_size=kernel_size, n_groups=n_groups
+ ),
+ ])
+
+ down_modules = nn.ModuleList([])
+ for ind, (dim_in, dim_out) in enumerate(in_out):
+ is_last = ind >= (len(in_out) - 1)
+ down_modules.append(nn.ModuleList([
+ ConditionalResidualBlock1D(
+ dim_in, dim_out, cond_dim=cond_dim,
+ kernel_size=kernel_size, n_groups=n_groups),
+ ConditionalResidualBlock1D(
+ dim_out, dim_out, cond_dim=cond_dim,
+ kernel_size=kernel_size, n_groups=n_groups),
+ Downsample1d(dim_out) if not is_last else nn.Identity()
+ ]))
+
+ up_modules = nn.ModuleList([])
+ for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+ is_last = ind >= (len(in_out) - 1)
+ up_modules.append(nn.ModuleList([
+ ConditionalResidualBlock1D(
+ dim_out*2, dim_in, cond_dim=cond_dim,
+ kernel_size=kernel_size, n_groups=n_groups),
+ ConditionalResidualBlock1D(
+ dim_in, dim_in, cond_dim=cond_dim,
+ kernel_size=kernel_size, n_groups=n_groups),
+ Upsample1d(dim_in) if not is_last else nn.Identity()
+ ]))
+
+ final_conv = nn.Sequential(
+ Conv1dBlock(start_dim, start_dim, kernel_size=kernel_size),
+ nn.Conv1d(start_dim, input_dim, 1),
+ )
+
+ self.diffusion_step_encoder = diffusion_step_encoder
+ self.up_modules = up_modules
+ self.down_modules = down_modules
+ self.final_conv = final_conv
+
+ print("number of parameters: {:e}".format(
+ sum(p.numel() for p in self.parameters()))
+ )
+
+ def forward(self,
+ sample: torch.Tensor,
+ timestep: Union[torch.Tensor, float, int],
+ global_cond=None):
+ """
+        sample: (B,T,input_dim)
+ timestep: (B,) or int, diffusion step
+ global_cond: (B,global_cond_dim)
+ output: (B,T,input_dim)
+ """
+ # (B,T,C)
+ sample = sample.moveaxis(-1,-2)
+ # (B,C,T)
+
+ # 1. time
+ timesteps = timestep
+ if not torch.is_tensor(timesteps):
+ timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+ elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+ timesteps = timesteps[None].to(sample.device)
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+ timesteps = timesteps.expand(sample.shape[0])
+
+ global_feature = self.diffusion_step_encoder(timesteps)
+
+ if global_cond is not None:
+ global_feature = torch.cat([
+ global_feature, global_cond
+ ], axis=-1)
+
+ x = sample
+ h = []
+ for idx, (resnet, resnet2, downsample) in enumerate(self.down_modules):
+ x = resnet(x, global_feature)
+ x = resnet2(x, global_feature)
+ h.append(x)
+ x = downsample(x)
+
+ for mid_module in self.mid_modules:
+ x = mid_module(x, global_feature)
+
+ for idx, (resnet, resnet2, upsample) in enumerate(self.up_modules):
+ x = torch.cat((x, h.pop()), dim=1)
+ x = resnet(x, global_feature)
+ x = resnet2(x, global_feature)
+ x = upsample(x)
+
+ x = self.final_conv(x)
+
+ # (B,C,T)
+ x = x.moveaxis(-1,-2)
+ # (B,T,C)
+ return x
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/gl.py b/phantom/submodules/phantom-robomimic/robomimic/algo/gl.py
new file mode 100644
index 0000000000000000000000000000000000000000..24ae800892ee0866f9b4df3d94ff49eb1cd8d112
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/gl.py
@@ -0,0 +1,775 @@
+"""
+Subgoal prediction models, used in HBC / IRIS.
+"""
+import numpy as np
+from collections import OrderedDict
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+
+import robomimic.models.obs_nets as ObsNets
+import robomimic.models.vae_nets as VAENets
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.utils.obs_utils as ObsUtils
+
+from robomimic.algo import register_algo_factory_func, PlannerAlgo, ValueAlgo
+
+
+@register_algo_factory_func("gl")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the GL algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+ if algo_config.vae.enabled:
+ return GL_VAE, {}
+ return GL, {}
+
+
+class GL(PlannerAlgo):
+ """
+ Implements goal prediction component for HBC and IRIS.
+ """
+ def __init__(
+ self,
+ algo_config,
+ obs_config,
+ global_config,
+ obs_key_shapes,
+ ac_dim,
+ device
+ ):
+ """
+ Args:
+ algo_config (Config object): instance of Config corresponding to the algo section
+ of the config
+
+ obs_config (Config object): instance of Config corresponding to the observation
+ section of the config
+
+ global_config (Config object): global training config
+
+ obs_key_shapes (OrderedDict): dictionary that maps observation keys to shapes
+
+ ac_dim (int): dimension of action space
+
+ device (torch.Device): where the algo should live (i.e. cpu, gpu)
+ """
+
+ self._subgoal_horizon = algo_config.subgoal_horizon
+ super(GL, self).__init__(
+ algo_config=algo_config,
+ obs_config=obs_config,
+ global_config=global_config,
+ obs_key_shapes=obs_key_shapes,
+ ac_dim=ac_dim,
+ device=device
+ )
+
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ self.nets = nn.ModuleDict()
+
+ obs_group_shapes = OrderedDict()
+ obs_group_shapes["obs"] = OrderedDict(self.obs_shapes)
+ if len(self.goal_shapes) > 0:
+ obs_group_shapes["goal"] = OrderedDict(self.goal_shapes)
+
+ # deterministic goal prediction network
+ self.nets["goal_network"] = ObsNets.MIMO_MLP(
+ input_obs_group_shapes=obs_group_shapes,
+ output_shapes=self.subgoal_shapes,
+ layer_dims=self.algo_config.ae.planner_layer_dims,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ self.nets = self.nets.float().to(self.device)
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
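+
+        Example (illustrative): with subgoal_horizon = 10, the subgoal target for each sample is the
+        observation 10 steps ahead, taken from next_obs at index subgoal_horizon - 1 in the sequence.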
+ """
+ input_batch = dict()
+
+ # remove temporal batches for all except scalar signals (to be compatible with model outputs)
+ input_batch["obs"] = { k: batch["obs"][k][:, 0, :] for k in batch["obs"] }
+ # extract multi-horizon subgoal target
+ input_batch["subgoals"] = {k: batch["next_obs"][k][:, self._subgoal_horizon - 1, :] for k in batch["next_obs"]}
+ input_batch["target_subgoals"] = input_batch["subgoals"]
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+ def get_actor_goal_for_training_from_processed_batch(self, processed_batch, **kwargs):
+ """
+ Retrieve subgoals from processed batch to use for training the actor. Subclasses
+ can modify this function to change the subgoals.
+
+ Args:
+ processed_batch (dict): processed batch from @process_batch_for_training
+
+ Returns:
+ actor_subgoals (dict): subgoal observations to condition actor on
+ """
+ return processed_batch["target_subgoals"]
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ with TorchUtils.maybe_no_grad(no_grad=validate):
+ info = super(GL, self).train_on_batch(batch, epoch, validate=validate)
+
+ # predict subgoal observations with goal network
+ pred_subgoals = self.nets["goal_network"](obs=batch["obs"], goal=batch["goal_obs"])
+
+ # compute loss as L2 error for each observation key
+ losses = OrderedDict()
+ target_subgoals = batch["target_subgoals"] # targets for network prediction
+ goal_loss = 0.
+ for k in pred_subgoals:
+ assert pred_subgoals[k].shape == target_subgoals[k].shape, "mismatch in predicted and target subgoals!"
+ mode_loss = nn.MSELoss()(pred_subgoals[k], target_subgoals[k])
+ goal_loss += mode_loss
+ losses["goal_{}_loss".format(k)] = mode_loss
+ losses["goal_loss"] = goal_loss
+ info.update(TensorUtils.detach(losses))
+
+ if not validate:
+ # gradient step
+ goal_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["goal_network"],
+ optim=self.optimizers["goal_network"],
+ loss=losses["goal_loss"],
+ )
+ info["goal_grad_norms"] = goal_grad_norms
+
+ return info
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ loss_log = super(GL, self).log_info(info)
+
+ loss_log["Loss"] = info["goal_loss"].item()
+ for k in info:
+ if k.endswith("_loss"):
+ loss_log[k] = info[k].item()
+ if "goal_grad_norms" in info:
+ loss_log["Grad_Norms"] = info["goal_grad_norms"]
+
+ return loss_log
+
+ def get_subgoal_predictions(self, obs_dict, goal_dict=None):
+ """
+ Takes a batch of observations and predicts a batch of subgoals.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoal prediction (dict): name -> Tensor [batch_size, ...]
+ """
+ return self.nets["goal_network"](obs=obs_dict, goal=goal_dict)
+
+ def sample_subgoals(self, obs_dict, goal_dict=None, num_samples=1):
+ """
+ Sample @num_samples subgoals from the network per observation.
+ Since this class implements a deterministic subgoal prediction,
+ this function returns identical subgoals for each input observation.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoals (dict): name -> Tensor [batch_size, num_samples, ...]
+ """
+
+ # stack observations to get all samples in one forward pass
+ obs_tiled = ObsUtils.repeat_and_stack_observation(obs_dict, n=num_samples)
+ goal_tiled = None
+ if goal_dict is not None:
+ goal_tiled = ObsUtils.repeat_and_stack_observation(goal_dict, n=num_samples)
+
+ # [batch_size * num_samples, ...]
+ goals = self.get_subgoal_predictions(obs_dict=obs_tiled, goal_dict=goal_tiled)
+ # reshape to [batch_size, num_samples, ...]
+ return TensorUtils.reshape_dimensions(goals, begin_axis=0, end_axis=0, target_dims=(-1, num_samples))
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs. Assumes one input observation (first dimension should be 1).
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ raise Exception("Rollouts are not supported by GL")
+
+
+class GL_VAE(GL):
+ """
+ Implements goal prediction via VAE.
+ """
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ self.nets = nn.ModuleDict()
+
+ self.nets["goal_network"] = VAENets.VAE(
+ input_shapes=self.subgoal_shapes,
+ output_shapes=self.subgoal_shapes,
+ condition_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ device=self.device,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **VAENets.vae_args_from_config(self.algo_config.vae),
+ )
+
+ self.nets = self.nets.float().to(self.device)
+
+ def get_actor_goal_for_training_from_processed_batch(
+ self,
+ processed_batch,
+ use_latent_subgoals=False,
+ use_prior_correction=False,
+ num_prior_samples=100,
+ **kwargs,
+ ):
+ """
+ Modify from superclass to support a @use_latent_subgoals option.
+ The VAE can optionally return latent subgoals by passing the subgoal
+ observations in the batch through the encoder.
+
+ Args:
+ processed_batch (dict): processed batch from @process_batch_for_training
+
+ use_latent_subgoals (bool): if True, condition the actor on latent subgoals
+ by using the VAE encoder to encode subgoal observations at train-time,
+ and using the VAE prior to generate latent subgoals at test-time
+
+ use_prior_correction (bool): if True, use a "prior correction" trick to
+ choose a latent subgoal sampled from the prior that is close to the
+ latent from the VAE encoder (posterior). This can help with issues at
+ test-time where the encoder latent distribution might not match
+ the prior latent distribution.
+
+ num_prior_samples (int): number of VAE prior samples to take and choose among,
+ if @use_prior_correction is true
+
+ Returns:
+ actor_subgoals (dict): subgoal observations to condition actor on
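+
+        Note (sketch of the prior correction below): prior latents are reshaped to
+        (batch_size, num_prior_samples, latent_dim), squared distances to the posterior latent are
+        computed per batch member, and the closest prior sample is returned as the latent subgoal.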
+ """
+
+ if not use_latent_subgoals:
+ return processed_batch["target_subgoals"]
+
+ # batch variables
+ obs = processed_batch["obs"]
+ subgoals = processed_batch["subgoals"] # full subgoal observations
+ target_subgoals = processed_batch["target_subgoals"] # targets for network prediction
+ goal_obs = processed_batch["goal_obs"]
+
+ with torch.no_grad():
+ # run VAE forward pass to get samples from posterior for the current observation and subgoal
+ vae_outputs = self.nets["goal_network"](
+ inputs=subgoals, # encoder takes full subgoals
+ outputs=target_subgoals, # reconstruct target subgoals
+ goals=goal_obs,
+ conditions=obs, # condition on observations
+ )
+ posterior_z = vae_outputs["encoder_z"]
+ latent_subgoals = posterior_z
+
+ if use_prior_correction:
+ # instead of treating posterior samples as latent subgoals, sample latents from
+ # the prior and choose the closest one as the latent subgoal
+
+ random_key = list(obs.keys())[0]
+ batch_size = obs[random_key].shape[0]
+
+ # for each batch member, get @num_prior_samples samples from the prior
+ obs_tiled = ObsUtils.repeat_and_stack_observation(obs, n=num_prior_samples)
+ goal_tiled = None
+ if len(self.goal_shapes) > 0:
+ goal_tiled = ObsUtils.repeat_and_stack_observation(goal_obs, n=num_prior_samples)
+
+ prior_z_samples = self.nets["goal_network"].sample_prior(
+ conditions=obs_tiled,
+ goals=goal_tiled,
+ )
+
+ # choose prior samples that are closest to the sampled posterior latents
+ # note: every posterior sample in the batch has @num_prior_samples corresponding prior samples
+
+ # reshape prior samples to (batch_size, num_samples, latent_dim)
+ prior_z_samples = prior_z_samples.reshape(batch_size, num_prior_samples, -1)
+
+ # reshape posterior latents to (batch_size, 1, latent_dim)
+ posterior_z_expanded = posterior_z.unsqueeze(1)
+
+ # compute distances with broadcasting so that each posterior sample
+ # has distances to all of its prior samples
+ distances = (prior_z_samples - posterior_z_expanded).pow(2).sum(dim=2)
+
+ # then gather the closest prior sample for each posterior sample
+ neighbors = torch.argmin(distances, dim=1)
+ latent_subgoals = prior_z_samples[torch.arange(batch_size).long(), neighbors]
+
+ return { "latent_subgoal" : latent_subgoals }
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ with TorchUtils.maybe_no_grad(no_grad=validate):
+ info = super(GL, self).train_on_batch(batch, epoch, validate=validate)
+
+ if self.algo_config.vae.prior.use_categorical:
+ temperature = self.algo_config.vae.prior.categorical_init_temp - epoch * self.algo_config.vae.prior.categorical_temp_anneal_step
+ temperature = max(temperature, self.algo_config.vae.prior.categorical_min_temp)
+ self.nets["goal_network"].set_gumbel_temperature(temperature)
+
+ # batch variables
+ obs = batch["obs"]
+ subgoals = batch["subgoals"] # full subgoal observations
+ target_subgoals = batch["target_subgoals"] # targets for network prediction
+ goal_obs = batch["goal_obs"]
+
+ vae_outputs = self.nets["goal_network"](
+ inputs=subgoals, # encoder takes full subgoals
+ outputs=target_subgoals, # reconstruct target subgoals
+ goals=goal_obs,
+ conditions=obs, # condition on observations
+ )
+ recons_loss = vae_outputs["reconstruction_loss"]
+ kl_loss = vae_outputs["kl_loss"]
+ goal_loss = recons_loss + self.algo_config.vae.kl_weight * kl_loss
+ info["recons_loss"] = recons_loss
+ info["kl_loss"] = kl_loss
+ info["goal_loss"] = goal_loss
+
+ if not self.algo_config.vae.prior.use_categorical:
+ with torch.no_grad():
+ info["encoder_variance"] = torch.exp(vae_outputs["encoder_params"]["logvar"])
+
+ # VAE gradient step
+ if not validate:
+ goal_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["goal_network"],
+ optim=self.optimizers["goal_network"],
+ loss=goal_loss,
+ )
+ info["goal_grad_norms"] = goal_grad_norms
+
+ return info
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ loss_log = super(GL_VAE, self).log_info(info)
+ loss_log["Reconstruction_Loss"] = info["recons_loss"].item()
+ loss_log["KL_Loss"] = info["kl_loss"].item()
+ if self.algo_config.vae.prior.use_categorical:
+ loss_log["Gumbel_Temperature"] = self.nets["goal_network"].get_gumbel_temperature()
+ else:
+ loss_log["Encoder_Variance"] = info["encoder_variance"].mean().item()
+ return loss_log
+
+ def get_subgoal_predictions(self, obs_dict, goal_dict=None):
+ """
+ Takes a batch of observations and predicts a batch of subgoals.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoal prediction (dict): name -> Tensor [batch_size, ...]
+ """
+
+ if self.global_config.algo.latent_subgoal.enabled:
+ # latent subgoals from sampling prior
+ latent_subgoals = self.nets["goal_network"].sample_prior(
+ conditions=obs_dict,
+ goals=goal_dict,
+ )
+
+ return OrderedDict(latent_subgoal=latent_subgoals)
+
+ # sample a single goal from the VAE
+ goals = self.sample_subgoals(obs_dict=obs_dict, goal_dict=goal_dict, num_samples=1)
+ return { k : goals[k][:, 0, ...] for k in goals }
+
+ def sample_subgoals(self, obs_dict, goal_dict=None, num_samples=1):
+ """
+ Sample @num_samples subgoals from the VAE per observation.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoals (dict): name -> Tensor [batch_size, num_samples, ...]
+ """
+
+ # stack observations to get all samples in one forward pass
+ obs_tiled = ObsUtils.repeat_and_stack_observation(obs_dict, n=num_samples)
+ goal_tiled = None
+ if goal_dict is not None:
+ goal_tiled = ObsUtils.repeat_and_stack_observation(goal_dict, n=num_samples)
+
+ # VAE decode expects number of samples explicitly
+ mod = list(obs_tiled.keys())[0]
+ n = obs_tiled[mod].shape[0]
+ # [batch_size * num_samples, ...]
+ goals = self.nets["goal_network"].decode(n=n, conditions=obs_tiled, goals=goal_tiled)
+ # reshape to [batch_size, num_samples, ...]
+ return TensorUtils.reshape_dimensions(goals, begin_axis=0, end_axis=0, target_dims=(-1, num_samples))
+
+
+class ValuePlanner(PlannerAlgo, ValueAlgo):
+ """
+ Base class for all algorithms that are used for planning subgoals
+ based on (1) a @PlannerAlgo that is used to sample candidate subgoals
+ and (2) a @ValueAlgo that is used to select one of the subgoals.
+ """
+ def __init__(
+ self,
+ planner_algo_class,
+ value_algo_class,
+ algo_config,
+ obs_config,
+ global_config,
+ obs_key_shapes,
+ ac_dim,
+ device,
+
+ ):
+ """
+ Args:
+ planner_algo_class (Algo class): algo class for the planner
+
+ value_algo_class (Algo class): algo class for the value network
+
+ algo_config (Config object): instance of Config corresponding to the algo section
+ of the config
+
+ obs_config (Config object): instance of Config corresponding to the observation
+ section of the config
+
+            global_config (Config object): global training config
+
+ obs_key_shapes (OrderedDict): dictionary that maps input/output observation keys to shapes
+
+ ac_dim (int): action dimension
+
+ device: torch device
+ """
+ self.algo_config = algo_config
+ self.obs_config = obs_config
+ self.global_config = global_config
+
+ self.ac_dim = ac_dim
+ self.device = device
+
+ self.planner = planner_algo_class(
+ algo_config=algo_config.planner,
+ obs_config=obs_config.planner,
+ global_config=global_config,
+ obs_key_shapes=obs_key_shapes,
+ ac_dim=ac_dim,
+ device=device
+ )
+
+ self.value_net = value_algo_class(
+ algo_config=algo_config.value,
+ obs_config=obs_config.value,
+ global_config=global_config,
+ obs_key_shapes=obs_key_shapes,
+ ac_dim=ac_dim,
+ device=device
+ )
+
+ self.subgoal_shapes = self.planner.subgoal_shapes
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+
+ input_batch["planner"] = self.planner.process_batch_for_training(batch)
+ input_batch["value_net"] = self.value_net.process_batch_for_training(batch)
+
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ if validate:
+ assert not self.planner.nets.training
+ assert not self.value_net.nets.training
+
+ info = dict(planner=dict(), value_net=dict())
+
+ # train planner
+ info["planner"].update(self.planner.train_on_batch(batch["planner"], epoch, validate=validate))
+
+ # train value network
+ info["value_net"].update(self.value_net.train_on_batch(batch["value_net"], epoch, validate=validate))
+
+ return info
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ loss = 0.
+
+ # planner
+ planner_log = self.planner.log_info(info["planner"])
+ planner_log = dict(("Planner/" + k, v) for k, v in planner_log.items())
+ loss += planner_log["Planner/Loss"]
+
+ # value network
+ value_net_log = self.value_net.log_info(info["value_net"])
+ value_net_log = dict(("ValueNetwork/" + k, v) for k, v in value_net_log.items())
+ loss += value_net_log["ValueNetwork/Loss"]
+ planner_log.update(value_net_log)
+
+ planner_log["Loss"] = loss
+ return planner_log
+
+ def on_epoch_end(self, epoch):
+ """
+ Called at the end of each epoch.
+ """
+ self.planner.on_epoch_end(epoch)
+ self.value_net.on_epoch_end(epoch)
+
+ def set_eval(self):
+ """
+ Prepare networks for evaluation.
+ """
+ self.planner.set_eval()
+ self.value_net.set_eval()
+
+ def set_train(self):
+ """
+ Prepare networks for training.
+ """
+ self.planner.set_train()
+ self.value_net.set_train()
+
+ def serialize(self):
+ """
+ Get dictionary of current model parameters.
+ """
+ return dict(
+ planner=self.planner.serialize(),
+ value_net=self.value_net.serialize(),
+ )
+
+ def deserialize(self, model_dict):
+ """
+ Load model from a checkpoint.
+
+ Args:
+ model_dict (dict): a dictionary saved by self.serialize() that contains
+ the same keys as @self.network_classes
+ """
+ self.planner.deserialize(model_dict["planner"])
+ self.value_net.deserialize(model_dict["value_net"])
+
+ def reset(self):
+ """
+ Reset algo state to prepare for environment rollouts.
+ """
+ self.planner.reset()
+ self.value_net.reset()
+
+ def __repr__(self):
+ """
+ Pretty print algorithm and network description.
+ """
+ msg = str(self.__class__.__name__)
+ import textwrap
+ return msg + "Planner:\n" + textwrap.indent(self.planner.__repr__(), ' ') + \
+ "\n\nValue Network:\n" + textwrap.indent(self.value_net.__repr__(), ' ')
+
+ def get_subgoal_predictions(self, obs_dict, goal_dict=None):
+ """
+ Takes a batch of observations and predicts a batch of subgoals.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoal prediction (dict): name -> Tensor [batch_size, ...]
+ """
+
+ num_samples = self.algo_config.num_samples
+
+ # sample subgoals from the planner (shape: [batch_size, num_samples, ...])
+ subgoals = self.sample_subgoals(obs_dict=obs_dict, goal_dict=goal_dict, num_samples=num_samples)
+
+ # stack subgoals to get all values in one forward pass (shape [batch_size * num_samples, ...])
+ k = list(obs_dict.keys())[0]
+ bsize = obs_dict[k].shape[0]
+ subgoals_tiled = TensorUtils.reshape_dimensions(subgoals, begin_axis=0, end_axis=1, target_dims=(bsize * num_samples,))
+
+ # also repeat goals if necessary
+ goal_tiled = None
+ if len(self.planner.goal_shapes) > 0:
+ goal_tiled = ObsUtils.repeat_and_stack_observation(goal_dict, n=num_samples)
+
+ # evaluate the value of each subgoal
+ subgoal_values = self.value_net.get_state_value(obs_dict=subgoals_tiled, goal_dict=goal_tiled).reshape(-1, num_samples)
+
+ # pick the best subgoal
+ best_index = torch.argmax(subgoal_values, dim=1)
+ best_subgoal = {k: subgoals[k][torch.arange(bsize), best_index] for k in subgoals}
+ return best_subgoal
+
+ def sample_subgoals(self, obs_dict, goal_dict, num_samples=1):
+ """
+ Sample @num_samples subgoals from the planner algo per observation.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ subgoals (dict): name -> Tensor [batch_size, num_samples, ...]
+ """
+ return self.planner.sample_subgoals(obs_dict=obs_dict, goal_dict=goal_dict, num_samples=num_samples)
+
+ def get_state_value(self, obs_dict, goal_dict=None):
+ """
+ Get state value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ return self.value_net.get_state_value(obs_dict=obs_dict, goal_dict=goal_dict)
+
+ def get_state_action_value(self, obs_dict, actions, goal_dict=None):
+ """
+ Get state-action value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ actions (torch.Tensor): action
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ return self.value_net.get_state_action_value(obs_dict=obs_dict, actions=actions, goal_dict=goal_dict)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/hbc.py b/phantom/submodules/phantom-robomimic/robomimic/algo/hbc.py
new file mode 100644
index 0000000000000000000000000000000000000000..543b1fbcf4ced11b9628d506b1972f1123a357b6
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/hbc.py
@@ -0,0 +1,344 @@
+"""
+Implementation of Hierarchical Behavioral Cloning, where
+a planner model outputs subgoals (future observations), and
+an actor model is conditioned on the subgoals to try and
+reach them. Largely based on the Generalization Through Imitation (GTI)
+paper (see https://arxiv.org/abs/2003.06085).
+"""
+import textwrap
+import numpy as np
+from collections import OrderedDict
+from copy import deepcopy
+
+import torch
+
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.obs_utils as ObsUtils
+from robomimic.config.config import Config
+from robomimic.algo import register_algo_factory_func, algo_name_to_factory_func, HierarchicalAlgo, GL_VAE
+
+
+@register_algo_factory_func("hbc")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the HBC algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+ pol_cls, _ = algo_name_to_factory_func("bc")(algo_config.actor)
+ plan_cls, _ = algo_name_to_factory_func("gl")(algo_config.planner)
+ return HBC, dict(policy_algo_class=pol_cls, planner_algo_class=plan_cls)
+
+
+class HBC(HierarchicalAlgo):
+ """
+ Default HBC training, largely based on https://arxiv.org/abs/2003.06085
+ """
+ def __init__(
+ self,
+ planner_algo_class,
+ policy_algo_class,
+ algo_config,
+ obs_config,
+ global_config,
+ obs_key_shapes,
+ ac_dim,
+ device,
+ ):
+ """
+ Args:
+ planner_algo_class (Algo class): algo class for the planner
+
+ policy_algo_class (Algo class): algo class for the policy
+
+ algo_config (Config object): instance of Config corresponding to the algo section
+ of the config
+
+ obs_config (Config object): instance of Config corresponding to the observation
+ section of the config
+
+ global_config (Config object): global training config
+
+ obs_key_shapes (dict): dictionary that maps input/output observation keys to shapes
+
+ ac_dim (int): action dimension
+
+ device: torch device
+ """
+ self.algo_config = algo_config
+ self.obs_config = obs_config
+ self.global_config = global_config
+
+ self.ac_dim = ac_dim
+ self.device = device
+
+ self._subgoal_step_count = 0 # current step count for deciding when to update subgoal
+ self._current_subgoal = None # latest subgoal
+ self._subgoal_update_interval = self.algo_config.subgoal_update_interval # subgoal update frequency
+ self._subgoal_horizon = self.algo_config.planner.subgoal_horizon
+ self._actor_horizon = self.algo_config.actor.rnn.horizon
+
+ self._algo_mode = self.algo_config.mode
+ assert self._algo_mode in ["separate", "cascade"]
+
+ self.planner = planner_algo_class(
+ algo_config=algo_config.planner,
+ obs_config=obs_config.planner,
+ global_config=global_config,
+ obs_key_shapes=obs_key_shapes,
+ ac_dim=ac_dim,
+ device=device
+ )
+
+ # goal-conditional actor follows goals set by the planner
+ self.actor_goal_shapes = self.planner.subgoal_shapes
+ if self.algo_config.latent_subgoal.enabled:
+ assert planner_algo_class == GL_VAE # only VAE supported for now
+ self.actor_goal_shapes = OrderedDict(latent_subgoal=(self.planner.algo_config.vae.latent_dim,))
+
+ # only for the actor: override goal modalities and shapes to match the subgoal set by the planner
+ actor_obs_key_shapes = deepcopy(obs_key_shapes)
+ # make sure we are not modifying existing observation key shapes
+ for k in self.actor_goal_shapes:
+ if k in actor_obs_key_shapes:
+ assert actor_obs_key_shapes[k] == self.actor_goal_shapes[k]
+ actor_obs_key_shapes.update(self.actor_goal_shapes)
+
+ goal_obs_keys = {obs_modality: [] for obs_modality in ObsUtils.OBS_MODALITY_CLASSES.keys()}
+ for k in self.actor_goal_shapes.keys():
+ goal_obs_keys[ObsUtils.OBS_KEYS_TO_MODALITIES[k]].append(k)
+
+ actor_obs_config = deepcopy(obs_config.actor)
+ with actor_obs_config.unlocked():
+ actor_obs_config["goal"] = Config(**goal_obs_keys)
+
+ self.actor = policy_algo_class(
+ algo_config=algo_config.actor,
+ obs_config=actor_obs_config,
+ global_config=global_config,
+ obs_key_shapes=actor_obs_key_shapes,
+ ac_dim=ac_dim,
+ device=device,
+ )
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+
+ input_batch["planner"] = self.planner.process_batch_for_training(batch)
+ input_batch["actor"] = self.actor.process_batch_for_training(batch)
+
+ if self.algo_config.actor_use_random_subgoals:
+ # optionally use randomly sampled step between [1, seq_length] as policy goal
+ policy_subgoal_indices = torch.randint(
+ low=0, high=self.global_config.train.seq_length, size=(batch["actions"].shape[0],))
+ goal_obs = TensorUtils.gather_sequence(batch["next_obs"], policy_subgoal_indices)
+ goal_obs = TensorUtils.to_float(TensorUtils.to_device(goal_obs, self.device))
+ input_batch["actor"]["goal_obs"] = \
+ self.planner.get_actor_goal_for_training_from_processed_batch(
+ goal_obs,
+ use_latent_subgoals=self.algo_config.latent_subgoal.enabled,
+ use_prior_correction=self.algo_config.latent_subgoal.prior_correction.enabled,
+ num_prior_samples=self.algo_config.latent_subgoal.prior_correction.num_samples,
+ )
+ else:
+ # otherwise, use planner subgoal target as goal for the policy
+ input_batch["actor"]["goal_obs"] = \
+ self.planner.get_actor_goal_for_training_from_processed_batch(
+ input_batch["planner"],
+ use_latent_subgoals=self.algo_config.latent_subgoal.enabled,
+ use_prior_correction=self.algo_config.latent_subgoal.prior_correction.enabled,
+ num_prior_samples=self.algo_config.latent_subgoal.prior_correction.num_samples,
+ )
+
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ info = dict(planner=dict(), actor=dict())
+ # train planner
+ info["planner"].update(self.planner.train_on_batch(batch["planner"], epoch, validate=validate))
+
+ # train actor
+ if self._algo_mode == "separate":
+ # train low-level actor by getting subgoals from the dataset
+ info["actor"].update(self.actor.train_on_batch(batch["actor"], epoch, validate=validate))
+
+ elif self._algo_mode == "cascade":
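+            # in "cascade" mode the actor is trained on the planner's own subgoal predictions
+            # (computed without gradients), rather than on ground-truth subgoals from the dataset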
+ # get predictions from the planner
+ with torch.no_grad():
+ batch["actor"]["goal_obs"] = self.planner.get_subgoal_predictions(
+ obs_dict=batch["planner"]["obs"], goal_dict=batch["planner"]["goal_obs"])
+
+ # train actor with the predicted goal
+ info["actor"].update(self.actor.train_on_batch(batch["actor"], epoch, validate=validate))
+
+ else:
+ raise NotImplementedError("algo mode {} is not implemented".format(self._algo_mode))
+
+ return info
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ planner_log = dict()
+ actor_log = dict()
+ loss = 0.
+
+ planner_log = self.planner.log_info(info["planner"])
+ planner_log = dict(("Planner/" + k, v) for k, v in planner_log.items())
+ loss += planner_log["Planner/Loss"]
+
+ actor_log = self.actor.log_info(info["actor"])
+ actor_log = dict(("Actor/" + k, v) for k, v in actor_log.items())
+ loss += actor_log["Actor/Loss"]
+
+ planner_log.update(actor_log)
+ planner_log["Loss"] = loss
+ return planner_log
+
+ def on_epoch_end(self, epoch):
+ """
+ Called at the end of each epoch.
+ """
+ self.planner.on_epoch_end(epoch)
+ self.actor.on_epoch_end(epoch)
+
+ def set_eval(self):
+ """
+ Prepare networks for evaluation.
+ """
+ self.planner.set_eval()
+ self.actor.set_eval()
+
+ def set_train(self):
+ """
+ Prepare networks for training.
+ """
+ self.planner.set_train()
+ self.actor.set_train()
+
+ def serialize(self):
+ """
+ Get dictionary of current model parameters.
+ """
+ return dict(
+ planner=self.planner.serialize(),
+ actor=self.actor.serialize(),
+ )
+
+ def deserialize(self, model_dict):
+ """
+ Load model from a checkpoint.
+
+ Args:
+ model_dict (dict): a dictionary saved by self.serialize() that contains
+ the same keys as @self.network_classes
+ """
+ self.actor.deserialize(model_dict["actor"])
+ self.planner.deserialize(model_dict["planner"])
+
+ @property
+ def current_subgoal(self):
+ """
+ Return the current subgoal (at rollout time) with shape (batch, ...)
+ """
+ return { k : self._current_subgoal[k].clone() for k in self._current_subgoal }
+
+ @current_subgoal.setter
+ def current_subgoal(self, sg):
+ """
+ Sets the current subgoal being used by the actor.
+ """
+ for k, v in sg.items():
+ if not self.algo_config.latent_subgoal.enabled:
+ # subgoal should only match subgoal shapes if not using latent subgoals
+ assert list(v.shape[1:]) == list(self.planner.subgoal_shapes[k])
+ # subgoal shapes should always match actor goal shapes
+ assert list(v.shape[1:]) == list(self.actor_goal_shapes[k])
+ self._current_subgoal = { k : sg[k].clone() for k in sg }
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ if self._current_subgoal is None or self._subgoal_step_count % self._subgoal_update_interval == 0:
+ # update current subgoal
+ self.current_subgoal = self.planner.get_subgoal_predictions(obs_dict=obs_dict, goal_dict=goal_dict)
+
+ action = self.actor.get_action(obs_dict=obs_dict, goal_dict=self.current_subgoal)
+ self._subgoal_step_count += 1
+ return action
+
+ def reset(self):
+ """
+ Reset algo state to prepare for environment rollouts.
+ """
+ self._current_subgoal = None
+ self._subgoal_step_count = 0
+ self.planner.reset()
+ self.actor.reset()
+
+ def __repr__(self):
+ """
+ Pretty print algorithm and network description.
+ """
+ msg = str(self.__class__.__name__)
+ msg += "(subgoal_horizon={}, actor_horizon={}, subgoal_update_interval={}, mode={}, " \
+ "actor_use_random_subgoals={})\n".format(
+ self._subgoal_horizon,
+ self._actor_horizon,
+ self._subgoal_update_interval,
+ self._algo_mode,
+ self.algo_config.actor_use_random_subgoals
+ )
+ return msg + "Planner:\n" + textwrap.indent(self.planner.__repr__(), ' ') + \
+ "\n\nPolicy:\n" + textwrap.indent(self.actor.__repr__(), ' ')
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/iql.py b/phantom/submodules/phantom-robomimic/robomimic/algo/iql.py
new file mode 100644
index 0000000000000000000000000000000000000000..bde522b2292e6140b5ce4e3120ad0c83e4064fff
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/iql.py
@@ -0,0 +1,428 @@
+"""
+Implementation of Implicit Q-Learning (IQL).
+Based on https://github.com/rail-berkeley/rlkit/blob/master/rlkit/torch/sac/iql_trainer.py.
+(Paper - https://arxiv.org/abs/2110.06169).
+"""
+import numpy as np
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import robomimic.models.policy_nets as PolicyNets
+import robomimic.models.value_nets as ValueNets
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+from robomimic.algo import register_algo_factory_func, ValueAlgo, PolicyAlgo
+
+
+@register_algo_factory_func("iql")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the IQL algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+ return IQL, {}
+
+
+class IQL(PolicyAlgo, ValueAlgo):
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+
+ Networks for this algo: critic (potentially ensemble), actor, value function
+ """
+
+ # Create nets
+ self.nets = nn.ModuleDict()
+
+ # Assemble args to pass to actor
+ actor_args = dict(self.algo_config.actor.net.common)
+
+ # Add network-specific args and define network class
+ if self.algo_config.actor.net.type == "gaussian":
+ actor_cls = PolicyNets.GaussianActorNetwork
+ actor_args.update(dict(self.algo_config.actor.net.gaussian))
+ elif self.algo_config.actor.net.type == "gmm":
+ actor_cls = PolicyNets.GMMActorNetwork
+ actor_args.update(dict(self.algo_config.actor.net.gmm))
+ else:
+ # Unsupported actor type!
+ raise ValueError(f"Unsupported actor requested. "
+ f"Requested: {self.algo_config.actor.net.type}, "
+ f"valid options are: {['gaussian', 'gmm']}")
+
+ # Actor
+ self.nets["actor"] = actor_cls(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor.layer_dims,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ **actor_args,
+ )
+
+ # Critics
+ self.nets["critic"] = nn.ModuleList()
+ self.nets["critic_target"] = nn.ModuleList()
+ for _ in range(self.algo_config.critic.ensemble.n):
+ for net_list in (self.nets["critic"], self.nets["critic_target"]):
+ critic = ValueNets.ActionValueNetwork(
+ obs_shapes=self.obs_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.critic.layer_dims,
+ goal_shapes=self.goal_shapes,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+ net_list.append(critic)
+
+ # Value function network
+ self.nets["vf"] = ValueNets.ValueNetwork(
+ obs_shapes=self.obs_shapes,
+ mlp_layer_dims=self.algo_config.critic.layer_dims,
+ goal_shapes=self.goal_shapes,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ # Send networks to appropriate device
+ self.nets = self.nets.float().to(self.device)
+
+ # sync target networks at beginning of training
+ with torch.no_grad():
+ for critic, critic_target in zip(self.nets["critic"], self.nets["critic_target"]):
+ TorchUtils.hard_update(
+ source=critic,
+ target=critic_target,
+ )
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out relevant info and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+
+ input_batch = dict()
+
+ # remove temporal batches for all
+ input_batch["obs"] = {k: batch["obs"][k][:, 0, :] for k in batch["obs"]}
+ input_batch["next_obs"] = {k: batch["next_obs"][k][:, 0, :] for k in batch["next_obs"]}
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+ input_batch["actions"] = batch["actions"][:, 0, :]
+ input_batch["dones"] = batch["dones"][:, 0]
+ input_batch["rewards"] = batch["rewards"][:, 0]
+
+ return TensorUtils.to_device(TensorUtils.to_float(input_batch), self.device)
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ info = OrderedDict()
+
+ # Set the correct context for this training step
+ with TorchUtils.maybe_no_grad(no_grad=validate):
+ # Always run super call first
+ info = super().train_on_batch(batch, epoch, validate=validate)
+
+ # Compute loss for critic(s)
+ critic_losses, vf_loss, critic_info = self._compute_critic_loss(batch)
+ # Compute loss for actor
+ actor_loss, actor_info = self._compute_actor_loss(batch, critic_info)
+
+ if not validate:
+ # Critic update
+ self._update_critic(critic_losses, vf_loss)
+
+ # Actor update
+ self._update_actor(actor_loss)
+
+ # Update info
+ info.update(actor_info)
+ info.update(critic_info)
+
+ # Return stats
+ return info
+
+ def _compute_critic_loss(self, batch):
+ """
+ Helper function for computing Q and V losses. Called by @train_on_batch
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ Returns:
+ critic_losses (list): list of critic (Q function) losses
+ vf_loss (torch.Tensor): value function loss
+ info (dict): dictionary of Q / V predictions and losses
+ """
+ info = OrderedDict()
+
+ # get batch values
+ obs = batch["obs"]
+ actions = batch["actions"]
+ next_obs = batch["next_obs"]
+ goal_obs = batch["goal_obs"]
+ rewards = torch.unsqueeze(batch["rewards"], 1)
+ dones = torch.unsqueeze(batch["dones"], 1)
+
+ # Q predictions
+ pred_qs = [critic(obs_dict=obs, acts=actions, goal_dict=goal_obs)
+ for critic in self.nets["critic"]]
+
+ info["critic/critic1_pred"] = pred_qs[0].mean()
+
+ # Q target values
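+        # TD target bootstraps from the (detached) value network at the next state:
+        # y = r + gamma * (1 - done) * V(s')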
+ target_vf_pred = self.nets["vf"](obs_dict=next_obs, goal_dict=goal_obs).detach()
+ q_target = rewards + (1. - dones) * self.algo_config.discount * target_vf_pred
+ q_target = q_target.detach()
+
+ # Q losses
+ critic_losses = []
+ td_loss_fcn = nn.SmoothL1Loss() if self.algo_config.critic.use_huber else nn.MSELoss()
+ for (i, q_pred) in enumerate(pred_qs):
+ # Calculate td error loss
+ td_loss = td_loss_fcn(q_pred, q_target)
+ info[f"critic/critic{i+1}_loss"] = td_loss
+ critic_losses.append(td_loss)
+
+ # V predictions
+ pred_qs = [critic(obs_dict=obs, acts=actions, goal_dict=goal_obs)
+ for critic in self.nets["critic_target"]]
+ q_pred, _ = torch.cat(pred_qs, dim=1).min(dim=1, keepdim=True)
+ q_pred = q_pred.detach()
+ vf_pred = self.nets["vf"](obs)
+
+ # V losses: expectile regression. see section 4.1 in https://arxiv.org/pdf/2110.06169.pdf
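+        # the expectile weight is |tau - 1(V > Q)|: errors where V underestimates Q get weight tau,
+        # overestimates get weight (1 - tau), so tau > 0.5 pushes V(s) toward an upper expectile of Q(s, a)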
+ vf_err = vf_pred - q_pred
+ vf_sign = (vf_err > 0).float()
+ vf_weight = (1 - vf_sign) * self.algo_config.vf_quantile + vf_sign * (1 - self.algo_config.vf_quantile)
+ vf_loss = (vf_weight * (vf_err ** 2)).mean()
+
+ # update logs for V loss
+ info["vf/q_pred"] = q_pred
+ info["vf/v_pred"] = vf_pred
+ info["vf/v_loss"] = vf_loss
+
+ # Return stats
+ return critic_losses, vf_loss, info
+
+ def _update_critic(self, critic_losses, vf_loss):
+ """
+ Helper function for updating critic and vf networks. Called by @train_on_batch
+
+ Args:
+ critic_losses (list): list of critic (Q function) losses
+ vf_loss (torch.Tensor): value function loss
+ """
+
+ # update ensemble of critics
+ for (critic_loss, critic, critic_target, optimizer) in zip(
+ critic_losses, self.nets["critic"], self.nets["critic_target"], self.optimizers["critic"]
+ ):
+ TorchUtils.backprop_for_loss(
+ net=critic,
+ optim=optimizer,
+ loss=critic_loss,
+ max_grad_norm=self.algo_config.critic.max_gradient_norm,
+ retain_graph=False,
+ )
+
+ # update target network
+ with torch.no_grad():
+ TorchUtils.soft_update(source=critic, target=critic_target, tau=self.algo_config.target_tau)
+
+ # update V function network
+ TorchUtils.backprop_for_loss(
+ net=self.nets["vf"],
+ optim=self.optimizers["vf"],
+ loss=vf_loss,
+ max_grad_norm=self.algo_config.critic.max_gradient_norm,
+ retain_graph=False,
+ )
+
+ def _compute_actor_loss(self, batch, critic_info):
+ """
+ Helper function for computing actor loss. Called by @train_on_batch
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ critic_info (dict): dictionary containing Q and V function predictions,
+ to be used for computing advantage estimates
+
+ Returns:
+ actor_loss (torch.Tensor): actor loss
+ info (dict): dictionary of actor losses, log_probs, advantages, and weights
+ """
+ info = OrderedDict()
+
+ # compute log probability of batch actions
+ dist = self.nets["actor"].forward_train(obs_dict=batch["obs"], goal_dict=batch["goal_obs"])
+ log_prob = dist.log_prob(batch["actions"])
+
+ info["actor/log_prob"] = log_prob.mean()
+
+ # compute advantage estimate
+ q_pred = critic_info["vf/q_pred"]
+ v_pred = critic_info["vf/v_pred"]
+ adv = q_pred - v_pred
+
+ # compute weights
+ weights = self._get_adv_weights(adv)
+
+ # compute advantage weighted actor loss. disable gradients through weights
+ actor_loss = (-log_prob * weights.detach()).mean()
+
+ info["actor/loss"] = actor_loss
+
+ # log adv-related values
+ info["adv/adv"] = adv
+ info["adv/adv_weight"] = weights
+
+ # Return stats
+ return actor_loss, info
+
+ def _update_actor(self, actor_loss):
+ """
+ Helper function for updating actor network. Called by @train_on_batch
+
+ Args:
+ actor_loss (torch.Tensor): actor loss
+ """
+
+ TorchUtils.backprop_for_loss(
+ net=self.nets["actor"],
+ optim=self.optimizers["actor"],
+ loss=actor_loss,
+ max_grad_norm=self.algo_config.actor.max_gradient_norm,
+ )
+
+ def _get_adv_weights(self, adv):
+ """
+ Helper function for computing advantage weights. Called by @_compute_actor_loss
+
+ Args:
+ adv (torch.Tensor): raw advantage estimates
+
+ Returns:
+ weights (torch.Tensor): weights computed based on advantage estimates,
+ in shape (B,) where B is batch size
+ """
+
+ # clip raw advantage values
+ if self.algo_config.adv.clip_adv_value is not None:
+ adv = adv.clamp(max=self.algo_config.adv.clip_adv_value)
+
+ # compute weights based on advantage values
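+        # exponentiated-advantage (AWR-style) weighting: w = exp((Q - V) / beta); a smaller beta
+        # concentrates the actor update on actions with the largest estimated advantage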
+        beta = self.algo_config.adv.beta # temperature factor
+ weights = torch.exp(adv / beta)
+
+ # clip final weights
+ if self.algo_config.adv.use_final_clip is True:
+ weights = weights.clamp(-100.0, 100.0)
+
+ # reshape from (B, 1) to (B,)
+ return weights[:, 0]
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ log = OrderedDict()
+
+ log["actor/log_prob"] = info["actor/log_prob"].item()
+ log["actor/loss"] = info["actor/loss"].item()
+
+ log["critic/critic1_pred"] = info["critic/critic1_pred"].item()
+ log["critic/critic1_loss"] = info["critic/critic1_loss"].item()
+
+ log["vf/v_loss"] = info["vf/v_loss"].item()
+
+ self._log_data_attributes(log, info, "vf/q_pred")
+ self._log_data_attributes(log, info, "vf/v_pred")
+ self._log_data_attributes(log, info, "adv/adv")
+ self._log_data_attributes(log, info, "adv/adv_weight")
+
+ return log
+
+ def _log_data_attributes(self, log, info, key):
+ """
+        Helper function for logging statistics. Modifies @log in-place.
+
+ Args:
+ log (dict): existing log dictionary
+            info (dict): dictionary of tensors containing raw stats
+ key (str): key to log
+ """
+ log[key + "/max"] = info[key].max().item()
+ log[key + "/min"] = info[key].min().item()
+ log[key + "/mean"] = info[key].mean().item()
+ log[key + "/std"] = info[key].std().item()
+
+ def on_epoch_end(self, epoch):
+ """
+ Called at the end of each epoch.
+ """
+
+ # LR scheduling updates
+ for lr_sc in self.lr_schedulers["critic"]:
+ if lr_sc is not None:
+ lr_sc.step()
+
+ if self.lr_schedulers["vf"] is not None:
+ self.lr_schedulers["vf"].step()
+
+ if self.lr_schedulers["actor"] is not None:
+ self.lr_schedulers["actor"].step()
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ assert not self.nets.training
+
+ return self.nets["actor"](obs_dict=obs_dict, goal_dict=goal_dict)
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/iris.py b/phantom/submodules/phantom-robomimic/robomimic/algo/iris.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b441470c796749f92682ecf2b38a48e0bb3ada5
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/iris.py
@@ -0,0 +1,183 @@
+"""
+Implementation of IRIS (https://arxiv.org/abs/1911.05321).
+"""
+import numpy as np
+from collections import OrderedDict
+from copy import deepcopy
+
+import torch
+
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.obs_utils as ObsUtils
+from robomimic.config.config import Config
+from robomimic.algo import register_algo_factory_func, algo_name_to_factory_func, HBC, ValuePlanner, ValueAlgo, GL_VAE
+
+
+@register_algo_factory_func("iris")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the IRIS algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+ pol_cls, _ = algo_name_to_factory_func("bc")(algo_config.actor)
+ plan_cls, _ = algo_name_to_factory_func("gl")(algo_config.value_planner.planner)
+ value_cls, _ = algo_name_to_factory_func("bcq")(algo_config.value_planner.value)
+ return IRIS, dict(policy_algo_class=pol_cls, planner_algo_class=plan_cls, value_algo_class=value_cls)
+
+
+class IRIS(HBC, ValueAlgo):
+ """
+ Implementation of IRIS (https://arxiv.org/abs/1911.05321).
+ """
+ def __init__(
+ self,
+ planner_algo_class,
+ value_algo_class,
+ policy_algo_class,
+ algo_config,
+ obs_config,
+ global_config,
+ obs_key_shapes,
+ ac_dim,
+ device,
+ ):
+ """
+ Args:
+ planner_algo_class (Algo class): algo class for the planner
+
+ policy_algo_class (Algo class): algo class for the policy
+
+ algo_config (Config object): instance of Config corresponding to the algo section
+ of the config
+
+ obs_config (Config object): instance of Config corresponding to the observation
+ section of the config
+
+ global_config (Config object): global training config
+
+ obs_key_shapes (OrderedDict): dictionary that maps input/output observation keys to shapes
+
+ ac_dim (int): action dimension
+
+ device: torch device
+ """
+ self.algo_config = algo_config
+ self.obs_config = obs_config
+ self.global_config = global_config
+
+ self.ac_dim = ac_dim
+ self.device = device
+
+ self._subgoal_step_count = 0 # current step count for deciding when to update subgoal
+ self._current_subgoal = None # latest subgoal
+ self._subgoal_update_interval = self.algo_config.subgoal_update_interval # subgoal update frequency
+ self._subgoal_horizon = self.algo_config.value_planner.planner.subgoal_horizon
+ self._actor_horizon = self.algo_config.actor.rnn.horizon
+
+ self._algo_mode = self.algo_config.mode
+ assert self._algo_mode in ["separate", "cascade"]
+
+ self.planner = ValuePlanner(
+ planner_algo_class=planner_algo_class,
+ value_algo_class=value_algo_class,
+ algo_config=algo_config.value_planner,
+ obs_config=obs_config.value_planner,
+ global_config=global_config,
+ obs_key_shapes=obs_key_shapes,
+ ac_dim=ac_dim,
+ device=device
+ )
+
+ self.actor_goal_shapes = self.planner.subgoal_shapes
+ assert not algo_config.latent_subgoal.enabled, "IRIS does not support latent subgoals"
+
+ # only for the actor: override goal modalities and shapes to match the subgoal set by the planner
+ actor_obs_key_shapes = deepcopy(obs_key_shapes)
+ # make sure we are not modifying existing observation key shapes
+ for k in self.actor_goal_shapes:
+ if k in actor_obs_key_shapes:
+ assert actor_obs_key_shapes[k] == self.actor_goal_shapes[k]
+ actor_obs_key_shapes.update(self.actor_goal_shapes)
+
+ goal_modalities = {obs_modality: [] for obs_modality in ObsUtils.OBS_MODALITY_CLASSES.keys()}
+ for k in self.actor_goal_shapes.keys():
+ goal_modalities[ObsUtils.OBS_KEYS_TO_MODALITIES[k]].append(k)
+
+ actor_obs_config = deepcopy(obs_config.actor)
+ with actor_obs_config.unlocked():
+ actor_obs_config["goal"] = Config(**goal_modalities)
+
+ self.actor = policy_algo_class(
+ algo_config=algo_config.actor,
+ obs_config=actor_obs_config,
+ global_config=global_config,
+ obs_key_shapes=actor_obs_key_shapes,
+ ac_dim=ac_dim,
+ device=device
+ )
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+
+ input_batch["planner"] = self.planner.process_batch_for_training(batch)
+ input_batch["actor"] = self.actor.process_batch_for_training(batch)
+
+ if self.algo_config.actor_use_random_subgoals:
+ # optionally use randomly sampled step between [1, seq_length] as policy goal
+ policy_subgoal_indices = torch.randint(
+ low=0, high=self.global_config.train.seq_length, size=(batch["actions"].shape[0],))
+ goal_obs = TensorUtils.gather_sequence(batch["next_obs"], policy_subgoal_indices)
+ goal_obs = TensorUtils.to_float(TensorUtils.to_device(goal_obs, self.device))
+ input_batch["actor"]["goal_obs"] = goal_obs
+ else:
+ # otherwise, use planner subgoal target as goal for the policy
+ input_batch["actor"]["goal_obs"] = input_batch["planner"]["planner"]["target_subgoals"]
+
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+ def get_state_value(self, obs_dict, goal_dict=None):
+ """
+ Get state value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ return self.planner.get_state_value(obs_dict=obs_dict, goal_dict=goal_dict)
+
+ def get_state_action_value(self, obs_dict, actions, goal_dict=None):
+ """
+ Get state-action value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ actions (torch.Tensor): action
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ return self.planner.get_state_action_value(obs_dict=obs_dict, actions=actions, goal_dict=goal_dict)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/algo/td3_bc.py b/phantom/submodules/phantom-robomimic/robomimic/algo/td3_bc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e324c54a1614c1c01b3efdb1def9c8f6f11b2c70
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/algo/td3_bc.py
@@ -0,0 +1,567 @@
+"""
+Implementation of TD3-BC.
+Based on https://github.com/sfujim/TD3_BC
+(Paper - https://arxiv.org/abs/2106.06860).
+
+Note that several parts are exactly the same as the BCQ implementation,
+such as @_create_critics, @process_batch_for_training, and
+@_train_critic_on_batch. They are replicated here (instead of subclassing
+from the BCQ algo class) to be explicit and have implementation details
+self-contained in this file.
+"""
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import robomimic.models.obs_nets as ObsNets
+import robomimic.models.policy_nets as PolicyNets
+import robomimic.models.value_nets as ValueNets
+import robomimic.models.vae_nets as VAENets
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.loss_utils as LossUtils
+
+from robomimic.algo import register_algo_factory_func, PolicyAlgo, ValueAlgo
+
+
+@register_algo_factory_func("td3_bc")
+def algo_config_to_class(algo_config):
+ """
+ Maps algo config to the TD3_BC algo class to instantiate, along with additional algo kwargs.
+
+ Args:
+ algo_config (Config instance): algo config
+
+ Returns:
+ algo_class: subclass of Algo
+ algo_kwargs (dict): dictionary of additional kwargs to pass to algorithm
+ """
+ # only one variant of TD3_BC for now
+ return TD3_BC, {}
+
+
+class TD3_BC(PolicyAlgo, ValueAlgo):
+ """
+ Default TD3_BC training, based on https://arxiv.org/abs/2106.06860 and
+ https://github.com/sfujim/TD3_BC.
+ """
+ def __init__(self, **kwargs):
+ PolicyAlgo.__init__(self, **kwargs)
+
+        # save the discount factor - it may be overridden later
+ self.set_discount(self.algo_config.discount)
+
+ # initialize actor update counter. This is used to train the actor at a lower freq than critic
+ self.actor_update_counter = 0
+
+ def _create_networks(self):
+ """
+ Creates networks and places them into @self.nets.
+ """
+ self.nets = nn.ModuleDict()
+
+ self._create_critics()
+ self._create_actor()
+
+ # sync target networks at beginning of training
+ with torch.no_grad():
+ for critic_ind in range(len(self.nets["critic"])):
+ TorchUtils.hard_update(
+ source=self.nets["critic"][critic_ind],
+ target=self.nets["critic_target"][critic_ind],
+ )
+
+ TorchUtils.hard_update(
+ source=self.nets["actor"],
+ target=self.nets["actor_target"],
+ )
+
+ self.nets = self.nets.float().to(self.device)
+
+ def _create_critics(self):
+ """
+ Called in @_create_networks to make critic networks.
+
+ Exactly the same as BCQ.
+ """
+ critic_class = ValueNets.ActionValueNetwork
+ critic_args = dict(
+ obs_shapes=self.obs_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.critic.layer_dims,
+ value_bounds=self.algo_config.critic.value_bounds,
+ goal_shapes=self.goal_shapes,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ # Q network ensemble and target ensemble
+ self.nets["critic"] = nn.ModuleList()
+ self.nets["critic_target"] = nn.ModuleList()
+ for _ in range(self.algo_config.critic.ensemble.n):
+ critic = critic_class(**critic_args)
+ self.nets["critic"].append(critic)
+
+ critic_target = critic_class(**critic_args)
+ self.nets["critic_target"].append(critic_target)
+
+ def _create_actor(self):
+ """
+ Called in @_create_networks to make actor network.
+ """
+ actor_class = PolicyNets.ActorNetwork
+ actor_args = dict(
+ obs_shapes=self.obs_shapes,
+ goal_shapes=self.goal_shapes,
+ ac_dim=self.ac_dim,
+ mlp_layer_dims=self.algo_config.actor.layer_dims,
+ encoder_kwargs=ObsUtils.obs_encoder_kwargs_from_config(self.obs_config.encoder),
+ )
+
+ self.nets["actor"] = actor_class(**actor_args)
+ self.nets["actor_target"] = actor_class(**actor_args)
+
+ def _check_epoch(self, net_name, epoch):
+ """
+ Helper function to check whether backprop should happen this epoch.
+
+ Args:
+ net_name (str): name of network in @self.nets and @self.optim_params
+ epoch (int): epoch number
+ """
+ epoch_start_check = (self.optim_params[net_name]["start_epoch"] == -1) or (epoch >= self.optim_params[net_name]["start_epoch"])
+ epoch_end_check = (self.optim_params[net_name]["end_epoch"] == -1) or (epoch < self.optim_params[net_name]["end_epoch"])
+ return (epoch_start_check and epoch_end_check)
+
+ def set_discount(self, discount):
+ """
+ Useful function to modify discount factor if necessary (e.g. for n-step returns).
+ """
+ self.discount = discount
+
+ def process_batch_for_training(self, batch):
+ """
+ Processes input batch from a data loader to filter out
+ relevant information and prepare the batch for training.
+
+ Exactly the same as BCQ.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader
+
+ Returns:
+ input_batch (dict): processed and filtered batch that
+ will be used for training
+ """
+ input_batch = dict()
+
+ # n-step returns (default is 1)
+ n_step = self.algo_config.n_step
+ assert batch["actions"].shape[1] >= n_step
+
+ # remove temporal batches for all
+ input_batch["obs"] = {k: batch["obs"][k][:, 0, :] for k in batch["obs"]}
+ input_batch["next_obs"] = {k: batch["next_obs"][k][:, n_step - 1, :] for k in batch["next_obs"]}
+ input_batch["goal_obs"] = batch.get("goal_obs", None) # goals may not be present
+ input_batch["actions"] = batch["actions"][:, 0, :]
+
+ # note: ensure scalar signals (rewards, done) retain last dimension of 1 to be compatible with model outputs
+
+ # single timestep reward is discounted sum of intermediate rewards in sequence
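+        # R = sum_{t=0}^{n-1} gamma^t * r_t; the bootstrap discount below then becomes gamma^n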
+ reward_seq = batch["rewards"][:, :n_step]
+ discounts = torch.pow(self.algo_config.discount, torch.arange(n_step).float()).unsqueeze(0)
+ input_batch["rewards"] = (reward_seq * discounts).sum(dim=1).unsqueeze(1)
+
+ # discount rate will be gamma^N for computing n-step returns
+ new_discount = (self.algo_config.discount ** n_step)
+ self.set_discount(new_discount)
+
+        # consider this n-step sequence done if any intermediate dones are present
+ done_seq = batch["dones"][:, :n_step]
+ input_batch["dones"] = (done_seq.sum(dim=1) > 0).float().unsqueeze(1)
+
+ if self.algo_config.infinite_horizon:
+ # scale terminal rewards by 1 / (1 - gamma) for infinite horizon MDPs
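+            # this approximates receiving the terminal reward forever: r * (1 + gamma + gamma^2 + ...) = r / (1 - gamma)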
+ done_inds = input_batch["dones"].round().long().nonzero(as_tuple=False)[:, 0]
+ if done_inds.shape[0] > 0:
+ input_batch["rewards"][done_inds] = input_batch["rewards"][done_inds] * (1. / (1. - self.discount))
+
+ # we move to device first before float conversion because image observation modalities will be uint8 -
+ # this minimizes the amount of data transferred to GPU
+ return TensorUtils.to_float(TensorUtils.to_device(input_batch, self.device))
+
+ def _train_critic_on_batch(self, batch, epoch, no_backprop=False):
+ """
+ A modular helper function that can be overridden in case
+ subclasses would like to modify training behavior for the
+ critics.
+
+ Exactly the same as BCQ (except for removal of @action_sampler_outputs and @critic_outputs)
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ no_backprop (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ info = OrderedDict()
+
+ # batch variables
+ s_batch = batch["obs"]
+ a_batch = batch["actions"]
+ r_batch = batch["rewards"]
+ ns_batch = batch["next_obs"]
+ goal_s_batch = batch["goal_obs"]
+
+ # 1 if not done, 0 otherwise
+ done_mask_batch = 1. - batch["dones"]
+ info["done_masks"] = done_mask_batch
+
+ # Bellman backup for Q-targets
+ q_targets = self._get_target_values(
+ next_states=ns_batch,
+ goal_states=goal_s_batch,
+ rewards=r_batch,
+ dones=done_mask_batch,
+ )
+ info["critic/q_targets"] = q_targets
+
+ # Train all critics using this set of targets for regression
+ for critic_ind, critic in enumerate(self.nets["critic"]):
+ critic_loss = self._compute_critic_loss(
+ critic=critic,
+ states=s_batch,
+ actions=a_batch,
+ goal_states=goal_s_batch,
+ q_targets=q_targets,
+ )
+ info["critic/critic{}_loss".format(critic_ind + 1)] = critic_loss
+
+ if not no_backprop:
+ critic_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["critic"][critic_ind],
+ optim=self.optimizers["critic"][critic_ind],
+ loss=critic_loss,
+ max_grad_norm=self.algo_config.critic.max_gradient_norm,
+ )
+ info["critic/critic{}_grad_norms".format(critic_ind + 1)] = critic_grad_norms
+
+ return info
+
+ def _train_actor_on_batch(self, batch, epoch, no_backprop=False):
+ """
+ A modular helper function that can be overridden in case
+ subclasses would like to modify training behavior for the
+ actor.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ no_backprop (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ info = OrderedDict()
+
+ # Actor loss (update with mixture of DDPG loss and BC loss)
+ s_batch = batch["obs"]
+ a_batch = batch["actions"]
+ goal_s_batch = batch["goal_obs"]
+
+ # lambda mixture weight is combination of hyperparameter (alpha) and Q-value normalization
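+        # actor objective: -lambda * Q(s, pi(s)) + MSE(pi(s), a), with lambda = alpha / mean(|Q|),
+        # which keeps the DDPG term and the BC term on a comparable scale regardless of Q magnitude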
+ actor_actions = self.nets["actor"](s_batch, goal_s_batch)
+ Q_values = self.nets["critic"][0](s_batch, actor_actions, goal_s_batch)
+ lam = self.algo_config.alpha / Q_values.abs().mean().detach()
+ actor_loss = -lam * Q_values.mean() + nn.MSELoss()(actor_actions, a_batch)
+ info["actor/loss"] = actor_loss
+
+ if not no_backprop:
+ actor_grad_norms = TorchUtils.backprop_for_loss(
+ net=self.nets["actor"],
+ optim=self.optimizers["actor"],
+ loss=actor_loss,
+ )
+ info["actor/grad_norms"] = actor_grad_norms
+
+ return info
+
+ def _get_target_values(self, next_states, goal_states, rewards, dones):
+ """
+ Helper function to get target values for training Q-function with TD-loss.
+
+ Args:
+ next_states (dict): batch of next observations
+ goal_states (dict): if not None, batch of goal observations
+ rewards (torch.Tensor): batch of rewards - should be shape (B, 1)
+            dones (torch.Tensor): batch of done masks (1. where not done) - should be shape (B, 1)
+
+ Returns:
+ q_targets (torch.Tensor): target Q-values to use for TD loss
+ """
+
+ with torch.no_grad():
+ # get next actions via target actor and noise
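+            # target policy smoothing: clipped Gaussian noise is added to the target action before
+            # evaluating the target critics, which regularizes the value targets (TD3 trick)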
+ next_target_actions = self.nets["actor_target"](next_states, goal_states)
+ noise = (
+ torch.randn_like(next_target_actions) * self.algo_config.actor.noise_std
+ ).clamp(-self.algo_config.actor.noise_clip, self.algo_config.actor.noise_clip)
+ next_actions = (next_target_actions + noise).clamp(-1.0, 1.0)
+
+ # TD3 trick to combine max and min over all Q-ensemble estimates into single target estimates
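+            # value_target = w * min_i Q_i + (1 - w) * max_i Q_i, with w = algo_config.critic.ensemble.weight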
+ all_value_targets = self.nets["critic_target"][0](next_states, next_actions, goal_states).reshape(-1, 1)
+ max_value_targets = all_value_targets
+ min_value_targets = all_value_targets
+ for critic_target in self.nets["critic_target"][1:]:
+ all_value_targets = critic_target(next_states, next_actions, goal_states).reshape(-1, 1)
+ max_value_targets = torch.max(max_value_targets, all_value_targets)
+ min_value_targets = torch.min(min_value_targets, all_value_targets)
+ value_targets = self.algo_config.critic.ensemble.weight * min_value_targets + \
+ (1. - self.algo_config.critic.ensemble.weight) * max_value_targets
+ q_targets = rewards + dones * self.discount * value_targets
+
+ return q_targets
+
+ def _compute_critic_loss(self, critic, states, actions, goal_states, q_targets):
+ """
+ Helper function to compute loss between estimated Q-values and target Q-values.
+
+ Nearly the same as BCQ (return type slightly different).
+
+ Args:
+ critic (torch.nn.Module): critic network
+ states (dict): batch of observations
+ actions (torch.Tensor): batch of actions
+ goal_states (dict): if not None, batch of goal observations
+ q_targets (torch.Tensor): batch of target q-values for the TD loss
+
+ Returns:
+ critic_loss (torch.Tensor): critic loss
+ """
+ q_estimated = critic(states, actions, goal_states)
+ if self.algo_config.critic.use_huber:
+ critic_loss = nn.SmoothL1Loss()(q_estimated, q_targets)
+ else:
+ critic_loss = nn.MSELoss()(q_estimated, q_targets)
+ return critic_loss
+
+ def train_on_batch(self, batch, epoch, validate=False):
+ """
+ Training on a single batch of data.
+
+ Args:
+ batch (dict): dictionary with torch.Tensors sampled
+ from a data loader and filtered by @process_batch_for_training
+
+ epoch (int): epoch number - required by some Algos that need
+ to perform staged training and early stopping
+
+ validate (bool): if True, don't perform any learning updates.
+
+ Returns:
+ info (dict): dictionary of relevant inputs, outputs, and losses
+ that might be relevant for logging
+ """
+ with TorchUtils.maybe_no_grad(no_grad=validate):
+ info = PolicyAlgo.train_on_batch(self, batch, epoch, validate=validate)
+
+ # Critic training
+ no_critic_backprop = validate or (not self._check_epoch(net_name="critic", epoch=epoch))
+ with TorchUtils.maybe_no_grad(no_grad=no_critic_backprop):
+ critic_info = self._train_critic_on_batch(
+ batch=batch,
+ epoch=epoch,
+ no_backprop=no_critic_backprop,
+ )
+ info.update(critic_info)
+
+ # update actor and target networks at lower frequency
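+            # delayed policy updates: the actor and the target networks are refreshed only every
+            # @algo_config.actor.update_freq critic gradient steps, as in TD3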
+ if not no_critic_backprop:
+ # update counter only on critic training gradient steps
+ self.actor_update_counter += 1
+ do_actor_update = (self.actor_update_counter % self.algo_config.actor.update_freq == 0)
+
+ # Actor training
+ no_actor_backprop = validate or (not self._check_epoch(net_name="actor", epoch=epoch))
+ no_actor_backprop = no_actor_backprop or (not do_actor_update)
+ with TorchUtils.maybe_no_grad(no_grad=no_actor_backprop):
+ actor_info = self._train_actor_on_batch(
+ batch=batch,
+ epoch=epoch,
+ no_backprop=no_actor_backprop,
+ )
+ info.update(actor_info)
+
+ if not no_actor_backprop:
+ # to match original implementation, only update target networks on
+ # actor gradient steps
+ with torch.no_grad():
+ # update the target critic networks
+ for critic_ind in range(len(self.nets["critic"])):
+ TorchUtils.soft_update(
+ source=self.nets["critic"][critic_ind],
+ target=self.nets["critic_target"][critic_ind],
+ tau=self.algo_config.target_tau,
+ )
+
+ # update target actor network
+ TorchUtils.soft_update(
+ source=self.nets["actor"],
+ target=self.nets["actor_target"],
+ tau=self.algo_config.target_tau,
+ )
+
+ return info
+
+ def log_info(self, info):
+ """
+ Process info dictionary from @train_on_batch to summarize
+ information to pass to tensorboard for logging.
+
+ Args:
+ info (dict): dictionary of info
+
+ Returns:
+ loss_log (dict): name -> summary statistic
+ """
+ loss_log = OrderedDict()
+
+ # record current optimizer learning rates
+ for k in self.optimizers:
+ keys = [k]
+ optims = [self.optimizers[k]]
+ if k == "critic":
+ # account for critic having one optimizer per ensemble member
+ keys = ["{}{}".format(k, critic_ind) for critic_ind in range(len(self.nets["critic"]))]
+ optims = self.optimizers[k]
+ for kp, optimizer in zip(keys, optims):
+ for i, param_group in enumerate(optimizer.param_groups):
+ loss_log["Optimizer/{}{}_lr".format(kp, i)] = param_group["lr"]
+
+ # extract relevant logs for critic, and actor
+ loss_log["Loss"] = 0.
+ for loss_logger in [self._log_critic_info, self._log_actor_info]:
+ this_log = loss_logger(info)
+ if "Loss" in this_log:
+ # manually merge total loss
+ loss_log["Loss"] += this_log["Loss"]
+ del this_log["Loss"]
+ loss_log.update(this_log)
+
+ return loss_log
+
+ def _log_critic_info(self, info):
+ """
+ Helper function to extract critic-relevant information for logging.
+ """
+ loss_log = OrderedDict()
+ if "done_masks" in info:
+ loss_log["Critic/Done_Mask_Percentage"] = 100. * torch.mean(info["done_masks"]).item()
+ if "critic/q_targets" in info:
+ loss_log["Critic/Q_Targets"] = info["critic/q_targets"].mean().item()
+ loss_log["Loss"] = 0.
+ for critic_ind in range(len(self.nets["critic"])):
+ loss_log["Critic/Critic{}_Loss".format(critic_ind + 1)] = info["critic/critic{}_loss".format(critic_ind + 1)].item()
+ if "critic/critic{}_grad_norms".format(critic_ind + 1) in info:
+ loss_log["Critic/Critic{}_Grad_Norms".format(critic_ind + 1)] = info["critic/critic{}_grad_norms".format(critic_ind + 1)]
+ loss_log["Loss"] += loss_log["Critic/Critic{}_Loss".format(critic_ind + 1)]
+ return loss_log
+
+ def _log_actor_info(self, info):
+ """
+ Helper function to extract actor-relevant information for logging.
+ """
+ loss_log = OrderedDict()
+ loss_log["Actor/Loss"] = info["actor/loss"].item()
+ if "actor/grad_norms" in info:
+ loss_log["Actor/Grad_Norms"] = info["actor/grad_norms"]
+ loss_log["Loss"] = loss_log["Actor/Loss"]
+ return loss_log
+
+ def set_train(self):
+ """
+        Prepare networks for training. Overridden from the superclass to make sure
+        the target networks stay in evaluation mode all the time.
+ """
+ self.nets.train()
+
+ # target networks always in eval
+ for critic_ind in range(len(self.nets["critic_target"])):
+ self.nets["critic_target"][critic_ind].eval()
+
+ self.nets["actor_target"].eval()
+
+ def on_epoch_end(self, epoch):
+ """
+ Called at the end of each epoch.
+ """
+
+ # LR scheduling updates
+ for lr_sc in self.lr_schedulers["critic"]:
+ if lr_sc is not None:
+ lr_sc.step()
+
+ if self.lr_schedulers["actor"] is not None:
+ self.lr_schedulers["actor"].step()
+
+ def get_action(self, obs_dict, goal_dict=None):
+ """
+ Get policy action outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ action (torch.Tensor): action tensor
+ """
+ assert not self.nets.training
+
+ return self.nets["actor"](obs_dict=obs_dict, goal_dict=goal_dict)
+
+ def get_state_value(self, obs_dict, goal_dict=None):
+ """
+ Get state value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ assert not self.nets.training
+
+ actions = self.nets["actor"](obs_dict=obs_dict, goal_dict=goal_dict)
+ return self.nets["critic"][0](obs_dict, actions, goal_dict)
+
+ def get_state_action_value(self, obs_dict, actions, goal_dict=None):
+ """
+ Get state-action value outputs.
+
+ Args:
+ obs_dict (dict): current observation
+ actions (torch.Tensor): action
+ goal_dict (dict): (optional) goal
+
+ Returns:
+ value (torch.Tensor): value tensor
+ """
+ assert not self.nets.training
+
+ return self.nets["critic"][0](obs_dict, actions, goal_dict)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/__init__.py b/phantom/submodules/phantom-robomimic/robomimic/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2cba6d3d89bcf9c73d7de8995dfef86ba9a8a94
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/__init__.py
@@ -0,0 +1,13 @@
+from robomimic.config.config import Config
+from robomimic.config.base_config import config_factory, get_all_registered_configs
+
+# note: these imports are needed to register these classes in the global config registry
+from robomimic.config.bc_config import BCConfig
+from robomimic.config.bcq_config import BCQConfig
+from robomimic.config.cql_config import CQLConfig
+from robomimic.config.iql_config import IQLConfig
+from robomimic.config.gl_config import GLConfig
+from robomimic.config.hbc_config import HBCConfig
+from robomimic.config.iris_config import IRISConfig
+from robomimic.config.td3_bc_config import TD3_BCConfig
+from robomimic.config.diffusion_policy_config import DiffusionPolicyConfig
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/base_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/base_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..8321365f446a15161e857598009cc6c69e97a26b
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/base_config.py
@@ -0,0 +1,336 @@
+"""
+The base config class that is used for all algorithm configs in this repository.
+Subclasses get registered into a global dictionary, making it easy to instantiate
+the correct config class given the algorithm name.
+"""
+
+import six # preserve metaclass compatibility between python 2 and 3
+from copy import deepcopy
+
+import robomimic
+from robomimic.config.config import Config
+
+# global dictionary for remembering name - class mappings
+REGISTERED_CONFIGS = {}
+
+
+def get_all_registered_configs():
+ """
+ Give access to dictionary of all registered configs for external use.
+ """
+ return deepcopy(REGISTERED_CONFIGS)
+
+
+def config_factory(algo_name, dic=None):
+ """
+ Creates an instance of a config from the algo name. Optionally pass
+ a dictionary to instantiate the config from the dictionary.
+ """
+ if algo_name not in REGISTERED_CONFIGS:
+ raise Exception("Config for algo name {} not found. Make sure it is a registered config among: {}".format(
+ algo_name, ', '.join(REGISTERED_CONFIGS)))
+ return REGISTERED_CONFIGS[algo_name](dict_to_load=dic)
+
+
+class ConfigMeta(type):
+ """
+ Define a metaclass for constructing a config class.
+ It registers configs into the global registry.
+ """
+ def __new__(meta, name, bases, class_dict):
+ cls = super(ConfigMeta, meta).__new__(meta, name, bases, class_dict)
+ if cls.__name__ != "BaseConfig":
+ REGISTERED_CONFIGS[cls.ALGO_NAME] = cls
+ return cls
+
+
+@six.add_metaclass(ConfigMeta)
+class BaseConfig(Config):
+ def __init__(self, dict_to_load=None):
+ if dict_to_load is not None:
+ super(BaseConfig, self).__init__(dict_to_load)
+ return
+
+ super(BaseConfig, self).__init__()
+
+ # store algo name class property in the config (must be implemented by subclasses)
+ self.algo_name = type(self).ALGO_NAME
+
+ self.experiment_config()
+ self.train_config()
+ self.algo_config()
+ self.observation_config()
+ self.meta_config()
+
+ # After Config init, new keys cannot be added to the config, except under nested
+ # attributes that have called @do_not_lock_keys
+ self.lock_keys()
+
+ @property
+ @classmethod
+ def ALGO_NAME(cls):
+ # must be specified by subclasses
+ raise NotImplementedError
+
+ def experiment_config(self):
+ """
+ This function populates the `config.experiment` attribute of the config,
+ which has several experiment settings such as the name of the training run,
+ whether to do logging, whether to save models (and how often), whether to render
+ videos, and whether to do rollouts (and how often). This class has a default
+        implementation that usually doesn't need to be overridden.
+ """
+
+ self.experiment.name = "test" # name of experiment used to make log files
+ self.experiment.validate = False # whether to do validation or not
+ self.experiment.logging.terminal_output_to_txt = True # whether to log stdout to txt file
+ self.experiment.logging.log_tb = True # enable tensorboard logging
+ self.experiment.logging.log_wandb = False # enable wandb logging
+ self.experiment.logging.wandb_proj_name = "debug" # project name if using wandb
+
+
+ ## save config - if and when to save model checkpoints ##
+ self.experiment.save.enabled = True # whether model saving should be enabled or disabled
+ self.experiment.save.every_n_seconds = None # save model every n seconds (set to None to disable)
+ self.experiment.save.every_n_epochs = 50 # save model every n epochs (set to None to disable)
+ self.experiment.save.epochs = [] # save model on these specific epochs
+ self.experiment.save.on_best_validation = False # save models that achieve best validation score
+ self.experiment.save.on_best_rollout_return = False # save models that achieve best rollout return
+ self.experiment.save.on_best_rollout_success_rate = True # save models that achieve best success rate
+
+ # epoch definitions - if not None, set an epoch to be this many gradient steps, else the full dataset size will be used
+ self.experiment.epoch_every_n_steps = 100 # number of gradient steps in train epoch (None for full dataset pass)
+ self.experiment.validation_epoch_every_n_steps = 10 # number of gradient steps in valid epoch (None for full dataset pass)
+
+ # envs to evaluate model on (assuming rollouts are enabled), to override the metadata stored in dataset
+ self.experiment.env = None # no need to set this (unless you want to override)
+ self.experiment.additional_envs = None # additional environments that should get evaluated
+
+
+ ## rendering config ##
+ self.experiment.render = False # render on-screen or not
+ self.experiment.render_video = True # render evaluation rollouts to videos
+ self.experiment.keep_all_videos = False # save all videos, instead of only saving those for saved model checkpoints
+ self.experiment.video_skip = 5 # render video frame every n environment steps during rollout
+
+
+ ## evaluation rollout config ##
+ self.experiment.rollout.enabled = True # enable evaluation rollouts
+ self.experiment.rollout.n = 50 # number of rollouts per evaluation
+ self.experiment.rollout.horizon = 400 # maximum number of env steps per rollout
+ self.experiment.rollout.rate = 50 # do rollouts every @rate epochs
+ self.experiment.rollout.warmstart = 0 # number of epochs to wait before starting rollouts
+ self.experiment.rollout.terminate_on_success = True # end rollout early after task success
+
+ # for updating the evaluation env meta data
+ self.experiment.env_meta_update_dict = Config()
+ self.experiment.env_meta_update_dict.do_not_lock_keys()
+
+ def train_config(self):
+ """
+ This function populates the `config.train` attribute of the config, which
+ has several settings related to the training process, such as the dataset
+ to use for training, and how the data loader should load the data. This
+        class has a default implementation that usually doesn't need to be overridden.
+ """
+
+ # Path to hdf5 dataset to use for training
+ self.train.data = None
+
+ # Write all results to this directory. A new folder with the timestamp will be created
+ # in this directory, and it will contain three subfolders - "log", "models", and "videos".
+ # The "log" directory will contain tensorboard and stdout txt logs. The "models" directory
+ # will contain saved model checkpoints. The "videos" directory contains evaluation rollout
+ # videos.
+ self.train.output_dir = "../{}_trained_models".format(self.algo_name)
+
+
+ ## dataset loader config ##
+
+ # num workers for loading data - generally set to 0 for low-dim datasets, and 2 for image datasets
+ self.train.num_data_workers = 0
+
+ # One of ["all", "low_dim", or None]. Set to "all" to cache entire hdf5 in memory - this is
+ # by far the fastest for data loading. Set to "low_dim" to cache all non-image data. Set
+ # to None to use no caching - in this case, every batch sample is retrieved via file i/o.
+ # You should almost never set this to None, even for large image datasets.
+ self.train.hdf5_cache_mode = "all"
+
+ # used for parallel data loading
+ self.train.hdf5_use_swmr = True
+
+ # whether to load "next_obs" group from hdf5 - only needed for batch / offline RL algorithms
+ self.train.hdf5_load_next_obs = True
+
+ # if true, normalize observations at train and test time, using the global mean and standard deviation
+ # of each observation in each dimension, computed across the training set. See SequenceDataset.normalize_obs
+ # in utils/dataset.py for more information.
+ self.train.hdf5_normalize_obs = False
+
+ # if provided, use the list of demo keys under the hdf5 group "mask/@hdf5_filter_key" for training, instead
+ # of the full dataset. This provides a convenient way to train on only a subset of the trajectories in a dataset.
+ self.train.hdf5_filter_key = None
+
+ # if provided, use the list of demo keys under the hdf5 group "mask/@hdf5_validation_filter_key" for validation.
+ # Must be provided if @experiment.validate is True.
+ self.train.hdf5_validation_filter_key = None
+
+ # length of experience sequence to fetch from the dataset
+ # and whether to pad the beginning / end of the sequence at boundaries of trajectory in dataset
+ self.train.seq_length = 1
+ self.train.pad_seq_length = True
+ self.train.frame_stack = 1
+ self.train.pad_frame_stack = True
+
+ # keys from hdf5 to load into each batch, besides "obs" and "next_obs". If algorithms
+ # require additional keys from each trajectory in the hdf5, they should be specified here.
+ self.train.dataset_keys = (
+ "actions",
+ "rewards",
+ "dones",
+ )
+
+ self.train.action_keys = ["actions"]
+
+ # specifying each action key to load and its corresponding normalization/conversion requirements
+ # e.g. for dataset keys "action/eef_pos" and "action/eef_rot"
+ # the desired value of self.train.action_config is:
+ # {
+ # "action/eef_pos": {
+ # "normalization": "min_max",
+ # "rot_conversion: None
+ # },
+ # "action/eef_rot": {
+ # "normalization": None,
+ # "rot_conversion: "axis_angle_to_6d"
+ # }
+ # }
+ # self.train.action_config.actions.normalization = None # "min_max"
+ # self.train.action_config.actions.rot_conversion = None # "axis_angle_to_6d"
+ self.train.action_config = {}
+ # self.train.action_config.do_not_lock_keys()
+
+ # one of [None, "last"] - set to "last" to include goal observations in each batch
+ self.train.goal_mode = None
+
+
+ ## learning config ##
+ self.train.cuda = True # use GPU or not
+ self.train.batch_size = 100 # batch size
+ self.train.num_epochs = 2000 # number of training epochs
+ self.train.seed = 1 # seed for training (for reproducibility)
+
+ self.train.data_format = "robomimic" # either "robomimic" or "r2d2"
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here. This function should be
+ implemented by every subclass.
+ """
+ pass
+
+ def observation_config(self):
+ """
+ This function populates the `config.observation` attribute of the config, and is given
+ to the `Algo` subclass (see `algo/algo.py`) for each algorithm through the `obs_config`
+ argument to the constructor. This portion of the config is used to specify what
+ observation modalities should be used by the networks for training, and how the
+ observation modalities should be encoded by the networks. While this class has a
+ default implementation that usually doesn't need to be overridden, certain algorithm
+ configs may choose to override it in order to have separate configs for different networks
+ in the algorithm.
+ """
+
+ # observation modalities
+ self.observation.modalities.obs.low_dim = [ # specify low-dim observations for agent
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object",
+ ]
+ self.observation.modalities.obs.rgb = [] # specify rgb image observations for agent
+ self.observation.modalities.obs.depth = []
+ self.observation.modalities.obs.scan = []
+ self.observation.modalities.goal.low_dim = [] # specify low-dim goal observations to condition agent on
+ self.observation.modalities.goal.rgb = [] # specify rgb image goal observations to condition agent on
+ self.observation.modalities.goal.depth = []
+ self.observation.modalities.goal.scan = []
+ self.observation.modalities.obs.do_not_lock_keys()
+ self.observation.modalities.goal.do_not_lock_keys()
+
+ # observation encoder architectures (per obs modality)
+ # This applies to all networks that take observation dicts as input
+
+ # =============== Low Dim default encoder (no encoder) ===============
+ self.observation.encoder.low_dim.core_class = None
+ self.observation.encoder.low_dim.core_kwargs = Config() # No kwargs by default
+ self.observation.encoder.low_dim.core_kwargs.do_not_lock_keys()
+
+ # Low Dim: Obs Randomizer settings
+ self.observation.encoder.low_dim.obs_randomizer_class = None
+ self.observation.encoder.low_dim.obs_randomizer_kwargs = Config() # No kwargs by default
+ self.observation.encoder.low_dim.obs_randomizer_kwargs.do_not_lock_keys()
+
+ # =============== RGB default encoder (ResNet backbone + linear layer output) ===============
+ self.observation.encoder.rgb.core_class = "VisualCore" # Default VisualCore class combines backbone (like ResNet-18) with pooling operation (like spatial softmax)
+ self.observation.encoder.rgb.core_kwargs = Config() # See models/obs_core.py for important kwargs to set and defaults used
+ self.observation.encoder.rgb.core_kwargs.do_not_lock_keys()
+
+ # RGB: Obs Randomizer settings
+ self.observation.encoder.rgb.obs_randomizer_class = None # Can set to 'CropRandomizer' to use crop randomization
+ self.observation.encoder.rgb.obs_randomizer_kwargs = Config() # See models/obs_core.py for important kwargs to set and defaults used
+ self.observation.encoder.rgb.obs_randomizer_kwargs.do_not_lock_keys()
+
+ # Allow for other custom modalities to be specified
+ self.observation.encoder.do_not_lock_keys()
+
+ # =============== Depth default encoder (same as rgb) ===============
+ self.observation.encoder.depth = deepcopy(self.observation.encoder.rgb)
+
+ # =============== Scan default encoder (Conv1d backbone + linear layer output) ===============
+ self.observation.encoder.scan = deepcopy(self.observation.encoder.rgb)
+
+ # Scan: Modify the core class + kwargs, otherwise, is same as rgb encoder
+ self.observation.encoder.scan.core_class = "ScanCore" # Default ScanCore class uses Conv1D to process this modality
+ self.observation.encoder.scan.core_kwargs = Config() # See models/obs_core.py for important kwargs to set and defaults used
+ self.observation.encoder.scan.core_kwargs.do_not_lock_keys()
+
+ def meta_config(self):
+ """
+ This function populates the `config.meta` attribute of the config. This portion of the config
+ is used to specify job information primarily for hyperparameter sweeps.
+ It contains hyperparameter keys and values, which are populated automatically
+ by the hyperparameter config generator (see `utils/hyperparam_utils.py`).
+ These values are read by the wandb logger (see `utils/log_utils.py`) to set job tags.
+ """
+
+ self.meta.hp_base_config_file = None # base config file in hyperparam sweep
+ self.meta.hp_keys = [] # relevant keys (swept) in hyperparam sweep
+ self.meta.hp_values = [] # values corresponding to keys in hyperparam sweep
+
+ @property
+ def use_goals(self):
+ # whether the agent is goal-conditioned
+ return len([obs_key for modality in self.observation.modalities.goal.values() for obs_key in modality]) > 0
+
+ @property
+ def all_obs_keys(self):
+ """
+ This grabs the union of observation keys over all modalities (e.g.: low_dim, rgb, depth, etc.) and over all
+ modality groups (e.g.: obs, goal, subgoal, etc.)
+
+ Returns:
+ list: all observation keys used for this model
+ """
+ # pool all modalities
+ return sorted(tuple(set([
+ obs_key for group in [
+ self.observation.modalities.obs.values(),
+ self.observation.modalities.goal.values()
+ ]
+ for modality in group
+ for obs_key in modality
+ ])))
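+
+
+ # Illustrative sketch (not part of the upstream robomimic source): concrete algorithm
+ # configs subclass BaseConfig, set ALGO_NAME, and fill in `algo_config` with their
+ # hyperparameters. The class below is a hypothetical example, e.g.
+ #
+ #     class MyAlgoConfig(BaseConfig):
+ #         ALGO_NAME = "my_algo"
+ #
+ #         def algo_config(self):
+ #             self.algo.optim_params.policy.learning_rate.initial = 1e-4
+ #             self.algo.actor_layer_dims = (1024, 1024)
+ #
+ # See BCConfig in bc_config.py for a complete example.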
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/bc_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/bc_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f701c685e9deb7755729d446ff272ba52a5ccc1
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/bc_config.py
@@ -0,0 +1,106 @@
+"""
+Config for BC algorithm.
+"""
+
+from robomimic.config.base_config import BaseConfig
+
+
+class BCConfig(BaseConfig):
+ ALGO_NAME = "bc"
+
+ def train_config(self):
+ """
+ BC algorithms don't need "next_obs" from hdf5 - so save on storage and compute by disabling it.
+ """
+ super(BCConfig, self).train_config()
+ self.train.hdf5_load_next_obs = False
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+
+ # optimization parameters
+ self.algo.optim_params.policy.optimizer_type = "adam"
+ self.algo.optim_params.policy.learning_rate.initial = 1e-4 # policy learning rate
+ self.algo.optim_params.policy.learning_rate.decay_factor = 0.1 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.policy.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.policy.learning_rate.scheduler_type = "multistep" # learning rate scheduler ("multistep", "linear", etc)
+ self.algo.optim_params.policy.regularization.L2 = 0.00 # L2 regularization strength
+
+ # loss weights
+ self.algo.loss.l2_weight = 1.0 # L2 loss weight
+ self.algo.loss.l1_weight = 0.0 # L1 loss weight
+ self.algo.loss.cos_weight = 0.0 # cosine loss weight
+
+ # MLP network architecture (layers after observation encoder and RNN, if present)
+ self.algo.actor_layer_dims = (1024, 1024)
+
+ # stochastic Gaussian policy settings
+ self.algo.gaussian.enabled = False # whether to train a Gaussian policy
+ self.algo.gaussian.fixed_std = False # whether to train std output or keep it constant
+ self.algo.gaussian.init_std = 0.1 # initial standard deviation (or constant)
+ self.algo.gaussian.min_std = 0.01 # minimum std output from network
+ self.algo.gaussian.std_activation = "softplus" # activation to use for std output from policy net
+ self.algo.gaussian.low_noise_eval = True # low-std at test-time
+
+ # stochastic GMM policy settings
+ self.algo.gmm.enabled = False # whether to train a GMM policy
+ self.algo.gmm.num_modes = 5 # number of GMM modes
+ self.algo.gmm.min_std = 0.0001 # minimum std output from network
+ self.algo.gmm.std_activation = "softplus" # activation to use for std output from policy net
+ self.algo.gmm.low_noise_eval = True # low-std at test-time
+
+ # stochastic VAE policy settings
+ self.algo.vae.enabled = False # whether to train a VAE policy
+ self.algo.vae.latent_dim = 14 # VAE latent dimension - set to twice the dimensionality of action space
+ self.algo.vae.latent_clip = None # clip latent space when decoding (set to None to disable)
+ self.algo.vae.kl_weight = 1. # beta-VAE weight to scale KL loss relative to reconstruction loss in ELBO
+
+ # VAE decoder settings
+ self.algo.vae.decoder.is_conditioned = True # whether decoder should condition on observation
+ self.algo.vae.decoder.reconstruction_sum_across_elements = False # sum instead of mean for reconstruction loss
+
+ # VAE prior settings
+ self.algo.vae.prior.learn = False # learn Gaussian / GMM prior instead of N(0, 1)
+ self.algo.vae.prior.is_conditioned = False # whether to condition prior on observations
+ self.algo.vae.prior.use_gmm = False # whether to use GMM prior
+ self.algo.vae.prior.gmm_num_modes = 10 # number of GMM modes
+ self.algo.vae.prior.gmm_learn_weights = False # whether to learn GMM weights
+ self.algo.vae.prior.use_categorical = False # whether to use categorical prior
+ self.algo.vae.prior.categorical_dim = 10 # the number of categorical classes for each latent dimension
+ self.algo.vae.prior.categorical_gumbel_softmax_hard = False # use hard selection in forward pass
+ self.algo.vae.prior.categorical_init_temp = 1.0 # initial gumbel-softmax temp
+ self.algo.vae.prior.categorical_temp_anneal_step = 0.001 # linear temp annealing rate
+ self.algo.vae.prior.categorical_min_temp = 0.3 # lowest gumbel-softmax temp
+
+ self.algo.vae.encoder_layer_dims = (300, 400) # encoder MLP layer dimensions
+ self.algo.vae.decoder_layer_dims = (300, 400) # decoder MLP layer dimensions
+ self.algo.vae.prior_layer_dims = (300, 400) # prior MLP layer dimensions (if learning conditioned prior)
+
+ # RNN policy settings
+ self.algo.rnn.enabled = False # whether to train RNN policy
+ self.algo.rnn.horizon = 10 # unroll length for RNN - should usually match train.seq_length
+ self.algo.rnn.hidden_dim = 400 # hidden dimension size
+ self.algo.rnn.rnn_type = "LSTM" # rnn type - one of "LSTM" or "GRU"
+ self.algo.rnn.num_layers = 2 # number of RNN layers that are stacked
+ self.algo.rnn.open_loop = False # if True, action predictions are only based on a single observation (not sequence)
+ self.algo.rnn.kwargs.bidirectional = False # rnn kwargs
+ self.algo.rnn.kwargs.do_not_lock_keys()
+
+ # Transformer policy settings
+ self.algo.transformer.enabled = False # whether to train transformer policy
+ self.algo.transformer.context_length = 10 # length of (s, a) sequences to feed to transformer - should usually match train.frame_stack
+ self.algo.transformer.embed_dim = 512 # dimension for embeddings used by transformer
+ self.algo.transformer.num_layers = 6 # number of transformer blocks to stack
+ self.algo.transformer.num_heads = 8 # number of attention heads for each transformer block (should divide embed_dim evenly)
+ self.algo.transformer.emb_dropout = 0.1 # dropout probability for embedding inputs in transformer
+ self.algo.transformer.attn_dropout = 0.1 # dropout probability for attention outputs for each transformer block
+ self.algo.transformer.block_output_dropout = 0.1 # dropout probability for final outputs for each transformer block
+ self.algo.transformer.sinusoidal_embedding = False # if True, use standard positional encodings (sin/cos)
+ self.algo.transformer.activation = "gelu" # activation function for MLP in Transformer Block
+ self.algo.transformer.supervise_all_steps = False # if true, supervise all intermediate actions, otherwise only final one
+ self.algo.transformer.nn_parameter_for_timesteps = True # if true, use nn.Parameter otherwise use nn.Embedding
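+
+
+ if __name__ == "__main__":
+     # Illustrative usage sketch (not part of the upstream robomimic source): switch the
+     # default MLP policy to an RNN-GMM policy and print the resulting json. The
+     # values_unlocked() guard is only a precaution in case the config object is locked
+     # after construction.
+     config = BCConfig()
+     with config.values_unlocked():
+         config.algo.gmm.enabled = True
+         config.algo.rnn.enabled = True
+         config.algo.rnn.horizon = 10
+         config.train.seq_length = 10  # should match the RNN unroll length
+     print(config.dump())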
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/bcq_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/bcq_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..e28f5ba5668aa2e5e6d9ca2187953a4e05b56a7d
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/bcq_config.py
@@ -0,0 +1,83 @@
+"""
+Config for BCQ algorithm.
+"""
+
+from robomimic.config.base_config import BaseConfig
+from robomimic.config.bc_config import BCConfig
+
+
+class BCQConfig(BaseConfig):
+ ALGO_NAME = "bcq"
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+
+ # optimization parameters
+ self.algo.optim_params.critic.learning_rate.initial = 1e-3 # critic learning rate
+ self.algo.optim_params.critic.learning_rate.decay_factor = 0.1 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.critic.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.critic.regularization.L2 = 0.00 # L2 regularization strength
+ self.algo.optim_params.critic.start_epoch = -1 # number of epochs before starting critic training (-1 means start right away)
+ self.algo.optim_params.critic.end_epoch = -1 # number of epochs before ending critic training (-1 means never end)
+
+ self.algo.optim_params.action_sampler.learning_rate.initial = 1e-3 # action sampler learning rate
+ self.algo.optim_params.action_sampler.learning_rate.decay_factor = 0.1 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.action_sampler.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.action_sampler.regularization.L2 = 0.00 # L2 regularization strength
+ self.algo.optim_params.action_sampler.start_epoch = -1 # number of epochs before starting action sampler training (-1 means start right away)
+ self.algo.optim_params.action_sampler.end_epoch = -1 # number of epochs before ending action sampler training (-1 means never end)
+
+ self.algo.optim_params.actor.learning_rate.initial = 1e-3 # actor learning rate
+ self.algo.optim_params.actor.learning_rate.decay_factor = 0.1 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.actor.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.actor.regularization.L2 = 0.00 # L2 regularization strength
+ self.algo.optim_params.actor.start_epoch = -1 # number of epochs before starting actor training (-1 means start right away)
+ self.algo.optim_params.actor.end_epoch = -1 # number of epochs before ending actor training (-1 means never end)
+
+ # target network related parameters
+ self.algo.discount = 0.99 # discount factor to use
+ self.algo.n_step = 1 # for using n-step returns in TD-updates
+ self.algo.target_tau = 0.005 # update rate for target networks
+ self.algo.infinite_horizon = False # if True, scale terminal rewards by 1 / (1 - discount) to treat as infinite horizon
+
+ # ================== Critic Network Config ===================
+ self.algo.critic.use_huber = False # Huber Loss instead of L2 for critic
+ self.algo.critic.max_gradient_norm = None # L2 gradient clipping for critic (None to use no clipping)
+ self.algo.critic.value_bounds = None # optional 2-tuple to ensure lower and upper bound on value estimates
+ self.algo.critic.num_action_samples = 10 # number of actions to sample per training batch to get target critic value
+ self.algo.critic.num_action_samples_rollout = 100 # number of actions to sample per environment step
+
+ # critic ensemble parameters (TD3 trick)
+ self.algo.critic.ensemble.n = 2 # number of Q networks in the ensemble
+ self.algo.critic.ensemble.weight = 0.75 # weighting for mixing min and max for target Q value
+
+ # distributional critic
+ self.algo.critic.distributional.enabled = False # train distributional critic (C51)
+ self.algo.critic.distributional.num_atoms = 51 # number of values in categorical distribution
+
+ self.algo.critic.layer_dims = (300, 400) # size of critic MLP
+
+ # ================== Action Sampler Config ===================
+ self.algo.action_sampler = BCConfig().algo
+ # use VAE by default
+ self.algo.action_sampler.vae.enabled = True
+ # remove unused parts of BCConfig algo config
+ del self.algo.action_sampler.optim_params # since action sampler optim params specified at top-level
+ del self.algo.action_sampler.loss
+ del self.algo.action_sampler.gaussian
+ del self.algo.action_sampler.rnn
+ del self.algo.action_sampler.transformer
+
+ # Number of epochs before freezing encoder (-1 for no freezing). Only applies to cVAE-based action samplers.
+ with self.algo.action_sampler.unlocked():
+ self.algo.action_sampler.freeze_encoder_epoch = -1
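+ # (the unlocked() context is required here because adding a brand-new key to a
+ # key-locked Config raises a RuntimeError - see config.py)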
+
+ # ================== Actor Network Config ===================
+ self.algo.actor.enabled = False # whether to use the actor perturbation network
+ self.algo.actor.perturbation_scale = 0.05 # size of learned action perturbations
+ self.algo.actor.layer_dims = (300, 400) # size of actor MLP
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/config.py b/phantom/submodules/phantom-robomimic/robomimic/config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..74da6535b385f91aa5c34e20af731ba2e3d06ecb
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/config.py
@@ -0,0 +1,322 @@
+"""
+Basic config class - provides a convenient way to work with nested
+dictionaries (by exposing keys as attributes) and to save / load from jsons.
+
+Based on addict: https://github.com/mewwts/addict
+"""
+
+import json
+import copy
+import contextlib
+from copy import deepcopy
+
+
+class Config(dict):
+
+ def __init__(__self, *args, **kwargs):
+ object.__setattr__(__self, '__key_locked', False) # disallow adding new keys
+ object.__setattr__(__self, '__all_locked', False) # disallow both key and value update
+ object.__setattr__(__self, '__do_not_lock_keys', False) # cannot be key-locked
+ object.__setattr__(__self, '__parent', kwargs.pop('__parent', None))
+ object.__setattr__(__self, '__key', kwargs.pop('__key', None))
+ for arg in args:
+ if not arg:
+ continue
+ elif isinstance(arg, dict):
+ for key, val in arg.items():
+ __self[key] = __self._hook(val)
+ elif isinstance(arg, tuple) and (not isinstance(arg[0], tuple)):
+ __self[arg[0]] = __self._hook(arg[1])
+ else:
+ for key, val in iter(arg):
+ __self[key] = __self._hook(val)
+
+ for key, val in kwargs.items():
+ __self[key] = __self._hook(val)
+
+ def lock(self):
+ """
+ Lock the config. Afterwards, new keys cannot be added to the
+ config, and the values of existing keys cannot be modified.
+ """
+ object.__setattr__(self, '__all_locked', True)
+ if self.key_lockable:
+ object.__setattr__(self, '__key_locked', True)
+
+ for k in self:
+ if isinstance(self[k], Config):
+ self[k].lock()
+
+ def unlock(self):
+ """
+ Unlock the config. Afterwards, new keys can be added to the
+ config, and the values of existing keys can be modified.
+ """
+ object.__setattr__(self, '__all_locked', False)
+ object.__setattr__(self, '__key_locked', False)
+
+ for k in self:
+ if isinstance(self[k], Config):
+ self[k].unlock()
+
+ def _get_lock_state_recursive(self):
+ """
+ Internal helper function to get the lock state of all sub-configs recursively.
+ """
+ lock_state = {"__all_locked": self.is_locked, "__key_locked": self.is_key_locked}
+ for k in self:
+ if isinstance(self[k], Config):
+ assert k not in ["__all_locked", "__key_locked"]
+ lock_state[k] = self[k]._get_lock_state_recursive()
+ return lock_state
+
+ def _set_lock_state_recursive(self, lock_state):
+ """
+ Internal helper function to set the lock state of all sub-configs recursively.
+ """
+ lock_state = deepcopy(lock_state)
+ object.__setattr__(self, '__all_locked', lock_state.pop("__all_locked"))
+ object.__setattr__(self, '__key_locked', lock_state.pop("__key_locked"))
+ for k in lock_state:
+ if isinstance(self[k], Config):
+ self[k]._set_lock_state_recursive(lock_state[k])
+
+ def _get_lock_state(self):
+ """
+ Retrieves the lock state of this config.
+
+ Returns:
+ lock_state (dict): a dictionary with an "all_locked" key that is True
+ if both key and value updates are locked and False otherwise, and
+ a "key_locked" key that is True if only key updates are locked (value
+ updates still allowed) and False otherwise
+ """
+ return {
+ "all_locked": self.is_locked,
+ "key_locked": self.is_key_locked
+ }
+
+ def _set_lock_state(self, lock_state):
+ """
+ Sets the lock state for this config.
+
+ Args:
+ lock_state (dict): a dictionary with an "all_locked" key that is True
+ if both key and value updates should be locked and False otherwise, and
+ a "key_locked" key that is True if only key updates should be locked (value
+ updates still allowed) and False otherwise
+ """
+ if lock_state["all_locked"]:
+ self.lock()
+ if lock_state["key_locked"]:
+ self.lock_keys()
+
+ @contextlib.contextmanager
+ def unlocked(self):
+ """
+ A context scope for modifying a Config object. Within the scope,
+ both keys and values can be updated. Upon leaving the scope,
+ the initial level of locking is restored.
+ """
+ lock_state = self._get_lock_state()
+ self.unlock()
+ yield
+ self._set_lock_state(lock_state)
+
+ @contextlib.contextmanager
+ def values_unlocked(self):
+ """
+ A context scope for modifying a Config object. Within the scope,
+ only values can be updated (new keys cannot be created). Upon
+ leaving the scope, the initial level of locking is restored.
+ """
+ lock_state = self._get_lock_state()
+ self.unlock()
+ self.lock_keys()
+ yield
+ self._set_lock_state(lock_state)
+
+ def lock_keys(self):
+ """
+ Lock this config so that new keys cannot be added.
+ """
+ if not self.key_lockable:
+ return
+ object.__setattr__(self, '__key_locked', True)
+ for k in self:
+ if isinstance(self[k], Config):
+ self[k].lock_keys()
+
+ def unlock_keys(self):
+ """
+ Unlock this config so that new keys can be added.
+ """
+ object.__setattr__(self, '__key_locked', False)
+ for k in self:
+ if isinstance(self[k], Config):
+ self[k].unlock_keys()
+
+ @property
+ def is_locked(self):
+ """
+ Returns True if the config is locked (no key or value updates allowed).
+ """
+ return object.__getattribute__(self, '__all_locked')
+
+ @property
+ def is_key_locked(self):
+ """
+ Returns True if the config is key-locked (no key updates allowed).
+ """
+ return object.__getattribute__(self, '__key_locked')
+
+ def do_not_lock_keys(self):
+ """
+ Calling this function on this config indicates that key updates should be
+ allowed even when this config is key-locked (but not when it is completely
+ locked). This is convenient for attributes that contain kwargs, where there
+ might be a variable type and number of arguments contained in the sub-config.
+ """
+ object.__setattr__(self, '__do_not_lock_keys', True)
+
+ @property
+ def key_lockable(self):
+ """
+ Returns true if this config is key-lockable (i.e., new keys cannot be inserted
+ once the config has been key-locked).
+ """
+ return not object.__getattribute__(self, '__do_not_lock_keys')
+
+ def __setattr__(self, name, value):
+ if self.is_locked:
+ raise RuntimeError("This config has been locked - cannot set attribute '{}' to {}".format(name, value))
+
+ if hasattr(Config, name):
+ raise AttributeError("'Dict' object attribute "
+ "'{0}' is read-only".format(name))
+ elif not hasattr(self, name) and self.is_key_locked:
+ raise RuntimeError("This config is key-locked - cannot add key '{}'".format(name))
+ else:
+ self[name] = value
+
+ def __setitem__(self, name, value):
+ super(Config, self).__setitem__(name, value)
+ p = object.__getattribute__(self, '__parent')
+ key = object.__getattribute__(self, '__key')
+ if p is not None:
+ p[key] = self
+
+ def __add__(self, other):
+ if not self.keys():
+ return other
+ else:
+ self_type = type(self).__name__
+ other_type = type(other).__name__
+ msg = "unsupported operand type(s) for +: '{}' and '{}'"
+ raise TypeError(msg.format(self_type, other_type))
+
+ @classmethod
+ def _hook(cls, item):
+ if isinstance(item, dict):
+ # Return a Config instance rather than cls so that nested sub-configs are plain Configs instead of the top-level config class
+ return Config(item)
+ elif isinstance(item, (list, tuple)):
+ return type(item)(Config._hook(elem) for elem in item)
+ return item
+
+ def __getattr__(self, item):
+ return self.__getitem__(item)
+
+ def __repr__(self):
+ json_string = json.dumps(self.to_dict(), indent=4)
+ return json_string
+
+ def __getitem__(self, name):
+ if name not in self:
+ if object.__getattribute__(self, '__all_locked') or object.__getattribute__(self, '__key_locked'):
+ raise RuntimeError("This config has been locked and '{}' is not in this config".format(name))
+ return Config(__parent=self, __key=name)
+ return super(Config, self).__getitem__(name)
+
+ def __delattr__(self, name):
+ del self[name]
+
+ def to_dict(self):
+ base = {}
+ for key, value in self.items():
+ if isinstance(value, type(self)):
+ base[key] = value.to_dict()
+ elif isinstance(value, (list, tuple)):
+ base[key] = type(value)(
+ item.to_dict() if isinstance(item, type(self)) else
+ item for item in value)
+ else:
+ base[key] = value
+ return base
+
+ def copy(self):
+ return copy.copy(self)
+
+ def deepcopy(self):
+ return copy.deepcopy(self)
+
+ def __deepcopy__(self, memo):
+ other = self.__class__()
+ memo[id(self)] = other
+ for key, value in self.items():
+ other[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo)
+ return other
+
+ def update(self, *args, **kwargs):
+ """
+ Update this config using another config or nested dictionary.
+ """
+ if self.is_locked:
+ raise RuntimeError('Cannot update - this config has been locked')
+ other = {}
+ if args:
+ if len(args) > 1:
+ raise TypeError()
+ other.update(args[0])
+ other.update(kwargs)
+ for k, v in other.items():
+ if self.is_key_locked and k not in self:
+ raise RuntimeError("Cannot update - this config has been key-locked and key '{}' does not exist".format(k))
+ if (not isinstance(self[k], dict)) or (not isinstance(v, dict)):
+ self[k] = v
+ else:
+ self[k].update(v)
+
+ def __getnewargs__(self):
+ return tuple(self.items())
+
+ def __getstate__(self):
+ return self
+
+ def __setstate__(self, state):
+ self.update(state)
+
+ def setdefault(self, key, default=None):
+ if key in self:
+ return self[key]
+ else:
+ self[key] = default
+ return default
+
+ def dump(self, filename=None):
+ """
+ Dumps the config to a json.
+
+ Args:
+ filename (str): if not None, save to json file.
+
+ Returns:
+ json_string (str): json string representation of
+ this config
+ """
+ json_string = json.dumps(self.to_dict(), indent=4)
+ if filename is not None:
+ with open(filename, "w") as f:
+ f.write(json_string)
+ return json_string
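+
+
+ if __name__ == "__main__":
+     # Illustrative usage sketch (not part of the upstream robomimic source).
+     # Nested keys can be created and read with attribute access, frozen with
+     # lock_keys(), and temporarily re-opened with the unlocked() context.
+     cfg = Config()
+     cfg.train.batch_size = 128       # nested sub-configs are created on the fly
+     cfg.train.optimizer.lr = 1e-4
+     cfg.lock_keys()                  # reject new keys from now on
+     with cfg.unlocked():             # temporarily allow new keys again
+         cfg.train.num_epochs = 10
+     print(cfg.dump())                # pretty-printed json string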
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/cql_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/cql_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..26fea048fe49d2d2d03f888eedab6754f37c8dcc
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/cql_config.py
@@ -0,0 +1,82 @@
+"""
+Config for CQL algorithm.
+"""
+
+from robomimic.config.base_config import BaseConfig
+
+
+class CQLConfig(BaseConfig):
+ ALGO_NAME = "cql"
+
+ def train_config(self):
+ """
+ Update from superclass to change default batch size.
+ """
+ super(CQLConfig, self).train_config()
+
+ # increase batch size to 1024 (found to work better for most manipulation experiments)
+ self.train.batch_size = 1024
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+
+ # optimization parameters
+ self.algo.optim_params.critic.learning_rate.initial = 1e-3 # critic learning rate
+ self.algo.optim_params.critic.learning_rate.decay_factor = 0.0 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.critic.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.critic.regularization.L2 = 0.00 # L2 regularization strength
+
+ self.algo.optim_params.actor.learning_rate.initial = 3e-4 # actor learning rate
+ self.algo.optim_params.actor.learning_rate.decay_factor = 0.0 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.actor.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.actor.regularization.L2 = 0.00 # L2 regularization strength
+
+ # target network related parameters
+ self.algo.discount = 0.99 # discount factor to use
+ self.algo.n_step = 1 # for using n-step returns in TD-updates
+ self.algo.target_tau = 0.005 # update rate for target networks
+
+ # ================== Actor Network Config ===================
+ self.algo.actor.bc_start_steps = 0 # uses BC policy loss for first n-training steps
+ self.algo.actor.target_entropy = "default" # None means fixed entropy; otherwise entropy is automatically tuned to match the target. "default" uses the default tuning target
+ self.algo.actor.max_gradient_norm = None # L2 gradient clipping for actor
+
+ # Actor network settings
+ self.algo.actor.net.type = "gaussian" # Options are currently only "gaussian" (no support for GMM yet)
+
+ # Actor network settings - shared
+ self.algo.actor.net.common.std_activation = "exp" # Activation to use for std output from policy net
+ self.algo.actor.net.common.use_tanh = True # Whether to use tanh at output of actor network
+ self.algo.actor.net.common.low_noise_eval = True # Whether to use deterministic action sampling at eval stage
+
+ # Actor network settings - gaussian
+ self.algo.actor.net.gaussian.init_last_fc_weight = 0.001 # If set, will override the initialization of the final fc layer to be uniformly sampled limited by this value
+ self.algo.actor.net.gaussian.init_std = 0.3 # Relative scaling factor for std from policy net
+ self.algo.actor.net.gaussian.fixed_std = False # Whether to learn std dev or not
+
+ self.algo.actor.layer_dims = (300, 400) # actor MLP layer dimensions
+
+ # ================== Critic Network Config ===================
+ self.algo.critic.use_huber = False # Huber Loss instead of L2 for critic
+ self.algo.critic.max_gradient_norm = None # L2 gradient clipping for critic (None to use no clipping)
+
+ self.algo.critic.value_bounds = None # optional 2-tuple to ensure lower and upper bound on value estimates
+
+ self.algo.critic.num_action_samples = 1 # number of actions to sample per training batch to get target critic value; use maximum Q value from n random sampled actions when doing TD error backup
+
+ # cql settings for critic
+ self.algo.critic.cql_weight = 1.0 # weighting for cql component of critic loss (only used if target_q_gap is < 0 or None)
+ self.algo.critic.deterministic_backup = True # if False, subtract the weighted log-prob of the action when doing the backup
+ self.algo.critic.min_q_weight = 1.0 # min q weight (scaling factor) to apply
+ self.algo.critic.target_q_gap = 5.0 # if set, sets the diff threshold at which Q-values will be penalized more (note: this overrides cql weight above!) Use None or a negative value if not set
+ self.algo.critic.num_random_actions = 10 # Number of random actions to sample when calculating CQL loss
+
+ # critic ensemble parameters (TD3 trick)
+ self.algo.critic.ensemble.n = 2 # number of Q networks in the ensemble
+
+ self.algo.critic.layer_dims = (300, 400) # critic MLP layer dimensions
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/default_templates/bc_transformer.json b/phantom/submodules/phantom-robomimic/robomimic/config/default_templates/bc_transformer.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed59f175b532c1cd61e8c4efefba1d985e8eaa31
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/default_templates/bc_transformer.json
@@ -0,0 +1,171 @@
+{
+ "algo_name": "bc",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../bc_transformer_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": false,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "seq_length": 1,
+ "pad_seq_length": true,
+ "frame_stack": 10,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 100,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "policy": {
+ "optimizer_type": "adamw",
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": [100],
+ "scheduler_type": "linear"
+ },
+ "regularization": {
+ "L2": 0.01
+ }
+ }
+ },
+ "loss": {
+ "l2_weight": 1.0,
+ "l1_weight": 0.0,
+ "cos_weight": 0.0
+ },
+ "actor_layer_dims": [],
+ "gaussian": {
+ "enabled": false
+ },
+ "gmm": {
+ "enabled": true,
+ "num_modes": 5,
+ "min_std": 0.0001,
+ "std_activation": "softplus",
+ "low_noise_eval": true
+ },
+ "vae": {
+ "enabled": false
+ },
+ "rnn": {
+ "enabled": false
+ },
+ "transformer": {
+ "enabled": true,
+ "supervise_all_steps": false,
+ "num_layers": 6,
+ "embed_dim": 512,
+ "num_heads": 8
+ }
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {
+ "feature_dimension": 64,
+ "backbone_class": "ResNet18Conv",
+ "backbone_kwargs": {
+ "pretrained": false,
+ "input_coord_conv": false
+ },
+ "pool_class": "SpatialSoftmax",
+ "pool_kwargs": {
+ "num_kp": 32,
+ "learnable_temperature": false,
+ "temperature": 1.0,
+ "noise_std": 0.0
+ }
+ },
+ "obs_randomizer_class": "CropRandomizer",
+ "obs_randomizer_kwargs": {
+ "crop_height": 76,
+ "crop_width": 76,
+ "num_crops": 1,
+ "pos_enc": false
+ }
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ }
+}
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/diffusion_policy_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/diffusion_policy_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..8662a107d53c2cdae95454caa521a677326a01d8
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/diffusion_policy_config.py
@@ -0,0 +1,57 @@
+"""
+Config for Diffusion Policy algorithm.
+"""
+
+from robomimic.config.base_config import BaseConfig
+
+class DiffusionPolicyConfig(BaseConfig):
+ ALGO_NAME = "diffusion_policy"
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+
+ # optimization parameters
+ self.algo.optim_params.policy.learning_rate.initial = 1e-4 # policy learning rate
+ self.algo.optim_params.policy.learning_rate.decay_factor = 0.1 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.policy.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.policy.regularization.L2 = 0.00 # L2 regularization strength
+
+ # horizon parameters
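+ # (following the Diffusion Policy convention: observation_horizon = number of recent
+ #  observation frames the policy conditions on, action_horizon = number of predicted
+ #  actions actually executed before re-planning, prediction_horizon = total length of
+ #  the predicted action sequence)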
+ self.algo.horizon.observation_horizon = 2
+ self.algo.horizon.action_horizon = 8
+ self.algo.horizon.prediction_horizon = 16
+
+ # UNet parameters
+ self.algo.unet.enabled = True
+ self.algo.unet.diffusion_step_embed_dim = 256
+ self.algo.unet.down_dims = [256,512,1024]
+ self.algo.unet.kernel_size = 5
+ self.algo.unet.n_groups = 8
+
+ # EMA parameters
+ self.algo.ema.enabled = True
+ self.algo.ema.power = 0.75
+
+ # Noise Scheduler
+ ## DDPM
+ self.algo.ddpm.enabled = True
+ self.algo.ddpm.num_train_timesteps = 100
+ self.algo.ddpm.num_inference_timesteps = 100
+ self.algo.ddpm.beta_schedule = 'squaredcos_cap_v2'
+ self.algo.ddpm.clip_sample = True
+ self.algo.ddpm.prediction_type = 'epsilon'
+
+ ## DDIM
+ self.algo.ddim.enabled = False
+ self.algo.ddim.num_train_timesteps = 100
+ self.algo.ddim.num_inference_timesteps = 10
+ self.algo.ddim.beta_schedule = 'squaredcos_cap_v2'
+ self.algo.ddim.clip_sample = True
+ self.algo.ddim.set_alpha_to_one = True
+ self.algo.ddim.steps_offset = 0
+ self.algo.ddim.prediction_type = 'epsilon'
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/gl_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/gl_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..939103e65dd5f7519fb7be2c9fa1928d5b430bf2
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/gl_config.py
@@ -0,0 +1,89 @@
+"""
+Config for Goal Learning (sub-algorithm used by hierarchical models like HBC and IRIS).
+This class of model predicts (or samples) subgoal observations given a current observation.
+"""
+
+from robomimic.config.base_config import BaseConfig
+
+
+class GLConfig(BaseConfig):
+ ALGO_NAME = "gl"
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+
+ # optimization parameters
+ self.algo.optim_params.goal_network.learning_rate.initial = 1e-4 # goal network learning rate
+ self.algo.optim_params.goal_network.learning_rate.decay_factor = 0.1 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.goal_network.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.goal_network.regularization.L2 = 0.00
+
+ # subgoal definition: observation that is @subgoal_horizon number of timesteps in future from current observation
+ self.algo.subgoal_horizon = 10
+
+ # MLP size for deterministic goal network (unused if VAE is enabled)
+ self.algo.ae.planner_layer_dims = (300, 400)
+
+ # ================== VAE config ==================
+ self.algo.vae.enabled = True # set to true to use VAE network
+ self.algo.vae.latent_dim = 16 # VAE latent dimension
+ self.algo.vae.latent_clip = None # clip latent space when decoding (set to None to disable)
+ self.algo.vae.kl_weight = 1. # beta-VAE weight to scale KL loss relative to reconstruction loss in ELBO
+
+ # VAE decoder settings
+ self.algo.vae.decoder.is_conditioned = True # whether decoder should condition on observation
+ self.algo.vae.decoder.reconstruction_sum_across_elements = False # sum instead of mean for reconstruction loss
+
+ # VAE prior settings
+ self.algo.vae.prior.learn = False # learn Gaussian / GMM prior instead of N(0, 1)
+ self.algo.vae.prior.is_conditioned = False # whether to condition prior on observations
+ self.algo.vae.prior.use_gmm = False # whether to use GMM prior
+ self.algo.vae.prior.gmm_num_modes = 10 # number of GMM modes
+ self.algo.vae.prior.gmm_learn_weights = False # whether to learn GMM weights
+ self.algo.vae.prior.use_categorical = False # whether to use categorical prior
+ self.algo.vae.prior.categorical_dim = 10 # the number of categorical classes for each latent dimension
+ self.algo.vae.prior.categorical_gumbel_softmax_hard = False # use hard selection in forward pass
+ self.algo.vae.prior.categorical_init_temp = 1.0 # initial gumbel-softmax temp
+ self.algo.vae.prior.categorical_temp_anneal_step = 0.001 # linear temp annealing rate
+ self.algo.vae.prior.categorical_min_temp = 0.3 # lowest gumbel-softmax temp
+
+ self.algo.vae.encoder_layer_dims = (300, 400) # encoder MLP layer dimensions
+ self.algo.vae.decoder_layer_dims = (300, 400) # decoder MLP layer dimensions
+ self.algo.vae.prior_layer_dims = (300, 400) # prior MLP layer dimensions (if learning conditioned prior)
+
+ def observation_config(self):
+ """
+ Update from superclass to specify subgoal modalities.
+ """
+ super(GLConfig, self).observation_config()
+ self.observation.modalities.subgoal.low_dim = [ # specify low-dim subgoal observations for agent to predict
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object",
+ ]
+ self.observation.modalities.subgoal.rgb = [] # specify rgb image subgoal observations for agent to predict
+ self.observation.modalities.subgoal.depth = []
+ self.observation.modalities.subgoal.scan = []
+ self.observation.modalities.subgoal.do_not_lock_keys()
+
+ @property
+ def all_obs_keys(self):
+ """
+ Update from superclass to include subgoals.
+ """
+ # pool all modalities
+ return sorted(tuple(set([
+ obs_key for group in [
+ self.observation.modalities.obs.values(),
+ self.observation.modalities.goal.values(),
+ self.observation.modalities.subgoal.values(),
+ ]
+ for modality in group
+ for obs_key in modality
+ ])))
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/hbc_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/hbc_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae65c9b85fc168dc65666392fd334810b622ab04
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/hbc_config.py
@@ -0,0 +1,96 @@
+"""
+Config for HBC algorithm.
+"""
+
+from robomimic.config.base_config import BaseConfig
+from robomimic.config.gl_config import GLConfig
+from robomimic.config.bc_config import BCConfig
+
+
+class HBCConfig(BaseConfig):
+ ALGO_NAME = "hbc"
+
+ def train_config(self):
+ """
+ Update from superclass to change default sequence length to load from dataset.
+ """
+ super(HBCConfig, self).train_config()
+ self.train.seq_length = 10 # length of experience sequence to fetch from the buffer
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+
+ # One of ["separate", "cascade"]. In "separate" mode (default),
+ # the planner and actor are trained independently and then the planner subgoal predictions are
+ # used to condition the actor at test-time. In "cascade" mode, the actor is trained directly
+ # on planner subgoal predictions. In "actor_only" mode, only the actor is trained, and in
+ # "planner_only" mode, only the planner is trained.
+ self.algo.mode = "separate"
+ self.algo.actor_use_random_subgoals = False # whether to sample subgoal index from [1, subgoal_horizon]
+ self.algo.subgoal_update_interval = 10 # how frequently the subgoal should be updated at test-time
+
+
+ # ================== Latent Subgoal Config ==================
+ self.algo.latent_subgoal.enabled = False # if True, use VAE latent space as subgoals for actor, instead of reconstructions
+
+ # prior correction trick for actor and value training: instead of using encoder for
+ # transforming subgoals to latent subgoals, generate prior samples and choose
+ # the closest one to the encoder output
+ self.algo.latent_subgoal.prior_correction.enabled = False
+ self.algo.latent_subgoal.prior_correction.num_samples = 100
+
+ # ================== Planner Config ==================
+ self.algo.planner = GLConfig().algo # config for goal learning
+ # set subgoal horizon explicitly
+ self.algo.planner.subgoal_horizon = 10
+ # ensure VAE is used
+ self.algo.planner.vae.enabled = True
+
+ # ================== Actor Config ===================
+ self.algo.actor = BCConfig().algo
+ # use RNN
+ self.algo.actor.rnn.enabled = True
+ self.algo.actor.rnn.horizon = 10
+ # remove unused parts of BCConfig algo config
+ del self.algo.actor.gaussian
+ del self.algo.actor.gmm
+ del self.algo.actor.vae
+
+ def observation_config(self):
+ """
+ Update from superclass so that planner and actor each get their own observation config.
+ """
+ self.observation.planner = GLConfig().observation
+ self.observation.actor = BCConfig().observation
+
+ @property
+ def use_goals(self):
+ """
+ Update from superclass - planner goal modalities determine goal-conditioning
+ """
+ return len(
+ self.observation.planner.modalities.goal.low_dim +
+ self.observation.planner.modalities.goal.rgb) > 0
+
+ @property
+ def all_obs_keys(self):
+ """
+ Update from superclass to include modalities from planner and actor.
+ """
+ # pool all modalities
+ return sorted(tuple(set([
+ obs_key for group in [
+ self.observation.planner.modalities.obs.values(),
+ self.observation.planner.modalities.goal.values(),
+ self.observation.planner.modalities.subgoal.values(),
+ self.observation.actor.modalities.obs.values(),
+ self.observation.actor.modalities.goal.values(),
+ ]
+ for modality in group
+ for obs_key in modality
+ ])))
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/iql_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/iql_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd603d1aa0183639971b16747c5020afa6d04fe3
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/iql_config.py
@@ -0,0 +1,73 @@
+"""
+Config for IQL algorithm.
+"""
+
+from robomimic.config.base_config import BaseConfig
+
+
+class IQLConfig(BaseConfig):
+ ALGO_NAME = "iql"
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+ super(IQLConfig, self).algo_config()
+
+ # optimization parameters
+ self.algo.optim_params.critic.learning_rate.initial = 1e-4 # critic learning rate
+ self.algo.optim_params.critic.learning_rate.decay_factor = 0.0 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.critic.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.critic.regularization.L2 = 0.00 # L2 regularization strength
+
+ self.algo.optim_params.vf.learning_rate.initial = 1e-4 # vf learning rate
+ self.algo.optim_params.vf.learning_rate.decay_factor = 0.0 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.vf.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.vf.regularization.L2 = 0.00 # L2 regularization strength
+
+ self.algo.optim_params.actor.learning_rate.initial = 1e-4 # actor learning rate
+ self.algo.optim_params.actor.learning_rate.decay_factor = 0.0 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.actor.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.actor.regularization.L2 = 0.00 # L2 regularization strength
+
+ # target network related parameters
+ self.algo.discount = 0.99 # discount factor to use
+ self.algo.target_tau = 0.01 # update rate for target networks
+
+ # ================== Actor Network Config ===================
+ # Actor network settings
+ self.algo.actor.net.type = "gaussian" # Options are currently ["gaussian", "gmm"]
+
+ # Actor network settings - shared
+ self.algo.actor.net.common.std_activation = "softplus" # Activation to use for std output from policy net
+ self.algo.actor.net.common.low_noise_eval = True # Whether to use deterministic action sampling at eval stage
+ self.algo.actor.net.common.use_tanh = False # Whether to use tanh at output of actor network
+
+ # Actor network settings - gaussian
+ self.algo.actor.net.gaussian.init_last_fc_weight = 0.001 # If set, will override the initialization of the final fc layer to be uniformly sampled limited by this value
+ self.algo.actor.net.gaussian.init_std = 0.3 # Relative scaling factor for std from policy net
+ self.algo.actor.net.gaussian.fixed_std = False # Whether to learn std dev or not
+
+ self.algo.actor.net.gmm.num_modes = 5 # number of GMM modes
+ self.algo.actor.net.gmm.min_std = 0.0001 # minimum std output from network
+
+ self.algo.actor.layer_dims = (300, 400) # actor MLP layer dimensions
+
+ self.algo.actor.max_gradient_norm = None # L2 gradient clipping for actor
+
+ # ================== Critic Network Config ===================
+ # critic ensemble parameters
+ self.algo.critic.ensemble.n = 2 # number of Q networks in the ensemble
+ self.algo.critic.layer_dims = (300, 400) # critic MLP layer dimensions
+ self.algo.critic.use_huber = False # Huber Loss instead of L2 for critic
+ self.algo.critic.max_gradient_norm = None # L2 gradient clipping for critic
+
+ # ================== Adv Config ==============================
+ self.algo.adv.clip_adv_value = None # whether to clip raw advantage estimates
+ self.algo.adv.beta = 1.0 # temperature used when exponentiating advantages to weight the actor loss
+ self.algo.adv.use_final_clip = True # whether to clip final weight calculations
+
+ self.algo.vf_quantile = 0.9 # quantile factor in quantile regression
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/iris_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/iris_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..c03328cead61f1a977d76bb4b684613586c2a08c
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/iris_config.py
@@ -0,0 +1,99 @@
+"""
+Config for IRIS algorithm.
+"""
+
+from robomimic.config.bcq_config import BCQConfig
+from robomimic.config.gl_config import GLConfig
+from robomimic.config.bc_config import BCConfig
+from robomimic.config.hbc_config import HBCConfig
+
+
+class IRISConfig(HBCConfig):
+ ALGO_NAME = "iris"
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+
+ # One of ["separate", "cascade"]. In "separate" mode (default),
+ # the planner and actor are trained independently and then the planner subgoal predictions are
+ # used to condition the actor at test-time. In "cascade" mode, the actor is trained directly
+ # on planner subgoal predictions. In "actor_only" mode, only the actor is trained, and in
+ # "planner_only" mode, only the planner is trained.
+ self.algo.mode = "separate"
+
+ self.algo.actor_use_random_subgoals = False # whether to sample subgoal index from [1, subgoal_horizon]
+ self.algo.subgoal_update_interval = 10 # how frequently the subgoal should be updated at test-time (usually matches train.seq_length)
+
+ # ================== Latent Subgoal Config ==================
+
+ # NOTE: latent subgoals are not supported by IRIS, but superclass expects this config
+ self.algo.latent_subgoal.enabled = False
+ self.algo.latent_subgoal.prior_correction.enabled = False
+ self.algo.latent_subgoal.prior_correction.num_samples = 100
+
+ # ================== Planner Config ==================
+
+ # The ValuePlanner planner component is a Goal Learning VAE model
+ self.algo.value_planner.planner = GLConfig().algo # config for goal learning
+ # set subgoal horizon explicitly
+ self.algo.value_planner.planner.subgoal_horizon = 10
+ # ensure VAE is used
+ self.algo.value_planner.planner.vae.enabled = True
+
+ # The ValuePlanner value component is a BCQ model
+ self.algo.value_planner.value = BCQConfig().algo
+ self.algo.value_planner.value.actor.enabled = False # ensure no BCQ actor
+ # number of subgoal samples to use for value planner
+ self.algo.value_planner.num_samples = 100
+
+ # ================== Actor Config ===================
+ self.algo.actor = BCConfig().algo
+ # use RNN
+ self.algo.actor.rnn.enabled = True
+ self.algo.actor.rnn.horizon = 10
+ # remove unused parts of BCConfig algo config
+ del self.algo.actor.gaussian
+ del self.algo.actor.gmm
+ del self.algo.actor.vae
+
+ def observation_config(self):
+ """
+ Update from superclass so that value planner and actor each get their own obs config.
+ """
+ self.observation.value_planner.planner = GLConfig().observation
+ self.observation.value_planner.value = BCQConfig().observation
+ self.observation.actor = BCConfig().observation
+
+ @property
+ def use_goals(self):
+ """
+ Update from superclass - value planner goal modalities determine goal-conditioning.
+ """
+ return len(
+ self.observation.value_planner.planner.modalities.goal.low_dim +
+ self.observation.value_planner.planner.modalities.goal.rgb) > 0
+
+ @property
+ def all_obs_keys(self):
+ """
+ Update from superclass to include modalities from value planner and actor.
+ """
+ # pool all modalities
+ return sorted(tuple(set([
+ obs_key for group in [
+ self.observation.value_planner.planner.modalities.obs.values(),
+ self.observation.value_planner.planner.modalities.goal.values(),
+ self.observation.value_planner.planner.modalities.subgoal.values(),
+ self.observation.value_planner.value.modalities.obs.values(),
+ self.observation.value_planner.value.modalities.goal.values(),
+ self.observation.actor.modalities.obs.values(),
+ self.observation.actor.modalities.goal.values(),
+ ]
+ for modality in group
+ for obs_key in modality
+ ])))
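+
+
+# ---------------------------------------------------------------------------
+# Editor's note: illustrative sketch only, not part of upstream robomimic.
+# It shows how the nested IRIS config built above is typically inspected -
+# the attribute names are exactly the ones populated in algo_config().
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+ cfg = IRISConfig()
+ # planner is a goal-learning VAE, value is a BCQ critic with its actor disabled
+ print(cfg.algo.value_planner.planner.subgoal_horizon) # 10
+ print(cfg.algo.value_planner.value.actor.enabled) # False
+ # actor is an RNN behavior-cloning policy conditioned on planner subgoals
+ print(cfg.algo.actor.rnn.horizon) # 10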
diff --git a/phantom/submodules/phantom-robomimic/robomimic/config/td3_bc_config.py b/phantom/submodules/phantom-robomimic/robomimic/config/td3_bc_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..036a2591a91b4a4f5da4e2415dd035117e587900
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/config/td3_bc_config.py
@@ -0,0 +1,111 @@
+"""
+Config for TD3_BC.
+"""
+
+from robomimic.config.base_config import BaseConfig
+
+
+class TD3_BCConfig(BaseConfig):
+ ALGO_NAME = "td3_bc"
+
+ def experiment_config(self):
+ """
+ Update from superclass to set paper defaults for gym envs.
+ """
+ super(TD3_BCConfig, self).experiment_config()
+
+ # no validation and no video rendering
+ self.experiment.validate = False
+ self.experiment.render_video = False
+
+ # save 10 checkpoints throughout training
+ self.experiment.save.every_n_epochs = 20
+
+ # save models that achieve best rollout return instead of best success rate
+ self.experiment.save.on_best_rollout_return = True
+ self.experiment.save.on_best_rollout_success_rate = False
+
+ # epoch definition - 5000 gradient steps per epoch, with 200 epochs = 1M gradient steps, and eval every epoch
+ self.experiment.epoch_every_n_steps = 5000
+
+ # evaluate with normal environment rollouts
+ self.experiment.rollout.enabled = True
+ self.experiment.rollout.n = 50 # paper uses 10, but we can afford to do 50
+ self.experiment.rollout.horizon = 1000
+ self.experiment.rollout.rate = 1 # rollout every epoch to match paper
+
+ def train_config(self):
+ """
+ Update from superclass to set paper defaults for gym envs.
+ """
+ super(TD3_BCConfig, self).train_config()
+
+ # update to normalize observations
+ self.train.hdf5_normalize_obs = True
+
+ # increase batch size to 256
+ self.train.batch_size = 256
+
+ # 200 epochs, with each epoch lasting 5000 gradient steps, for 1M total steps
+ self.train.num_epochs = 200
+
+ def algo_config(self):
+ """
+ This function populates the `config.algo` attribute of the config, and is given to the
+ `Algo` subclass (see `algo/algo.py`) for each algorithm through the `algo_config`
+ argument to the constructor. Any parameter that an algorithm needs to determine its
+ training and test-time behavior should be populated here.
+ """
+
+ # optimization parameters
+ self.algo.optim_params.critic.learning_rate.initial = 3e-4 # critic learning rate
+ self.algo.optim_params.critic.learning_rate.decay_factor = 0.1 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.critic.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.critic.regularization.L2 = 0.00 # L2 regularization strength
+ self.algo.optim_params.critic.start_epoch = -1 # number of epochs before starting critic training (-1 means start right away)
+ self.algo.optim_params.critic.end_epoch = -1 # number of epochs before ending critic training (-1 means never end)
+
+ self.algo.optim_params.actor.learning_rate.initial = 3e-4 # actor learning rate
+ self.algo.optim_params.actor.learning_rate.decay_factor = 0.1 # factor to decay LR by (if epoch schedule non-empty)
+ self.algo.optim_params.actor.learning_rate.epoch_schedule = [] # epochs where LR decay occurs
+ self.algo.optim_params.actor.regularization.L2 = 0.00 # L2 regularization strength
+ self.algo.optim_params.actor.start_epoch = -1 # number of epochs before starting actor training (-1 means start right away)
+ self.algo.optim_params.actor.end_epoch = -1 # number of epochs before ending actor training (-1 means never end)
+
+ # alpha value - for weighting critic loss vs. BC loss
+ self.algo.alpha = 2.5
+
+ # target network related parameters
+ self.algo.discount = 0.99 # discount factor to use
+ self.algo.n_step = 1 # for using n-step returns in TD-updates
+ self.algo.target_tau = 0.005 # update rate for target networks
+ self.algo.infinite_horizon = False # if True, scale terminal rewards by 1 / (1 - discount) to treat as infinite horizon
+
+ # ================== Critic Network Config ===================
+ self.algo.critic.use_huber = False # Huber Loss instead of L2 for critic
+ self.algo.critic.max_gradient_norm = None # L2 gradient clipping for critic (None to use no clipping)
+ self.algo.critic.value_bounds = None # optional 2-tuple to ensure lower and upper bound on value estimates
+
+ # critic ensemble parameters (TD3 trick)
+ self.algo.critic.ensemble.n = 2 # number of Q networks in the ensemble
+ self.algo.critic.ensemble.weight = 1.0 # weighting for mixing min and max for target Q value
+
+ self.algo.critic.layer_dims = (256, 256) # size of critic MLP
+
+ # ================== Actor Network Config ===================
+
+ # update actor and target networks every n gradients steps for each critic gradient step
+ self.algo.actor.update_freq = 2
+
+ # exploration noise used to form target action for Q-update - clipped Gaussian noise
+ self.algo.actor.noise_std = 0.2 # zero-mean gaussian noise with this std is applied to actions
+ self.algo.actor.noise_clip = 0.5 # noise is clipped in each dimension to (-noise_clip, noise_clip)
+
+ self.algo.actor.layer_dims = (256, 256) # size of actor MLP
+
+ def observation_config(self):
+ """
+ Update from superclass to use flat observations from gym envs.
+ """
+ super(TD3_BCConfig, self).observation_config()
+ self.observation.modalities.obs.low_dim = ["flat"]
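+
+
+# ---------------------------------------------------------------------------
+# Editor's note: illustrative sketch only, not part of upstream robomimic.
+# In the TD3+BC paper, `alpha` above trades off the critic term against the
+# behavior-cloning term in the actor loss via an adaptive weight
+# lmbda = alpha / mean(|Q(s, pi(s))|), i.e. loss_pi = -lmbda * mean(Q) + BC_MSE.
+# The snippet below only demonstrates that weighting arithmetic on dummy
+# values; it is not the training code itself.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+ import numpy as np
+ alpha = TD3_BCConfig().algo.alpha # 2.5 by default
+ q_pi = np.array([10.0, 12.0, 8.0]) # dummy critic estimates Q(s, pi(s))
+ bc_mse = 0.05 # dummy behavior-cloning regression error
+ lmbda = alpha / np.abs(q_pi).mean()
+ actor_loss = -lmbda * q_pi.mean() + bc_mse
+ print("lambda = {:.3f}, actor loss = {:.3f}".format(lmbda, actor_loss))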
diff --git a/phantom/submodules/phantom-robomimic/robomimic/envs/__init__.py b/phantom/submodules/phantom-robomimic/robomimic/envs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-robomimic/robomimic/envs/env_base.py b/phantom/submodules/phantom-robomimic/robomimic/envs/env_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1006ea1bf4f29357f2b32127fd6cf268697c948
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/envs/env_base.py
@@ -0,0 +1,245 @@
+"""
+This file contains the base class for environment wrappers that are used
+to provide a standardized environment API for training policies and interacting
+with metadata present in datasets.
+"""
+import abc
+
+
+class EnvType:
+ """
+ Holds environment types - one per environment class.
+ These act as identifiers for different environments.
+ """
+ ROBOSUITE_TYPE = 1
+ GYM_TYPE = 2
+ IG_MOMART_TYPE = 3
+ REAL_TYPE = 6
+ GPRS_REAL_TYPE = 7
+ REAL_UR5E_TYPE = 8
+ REAL_KINOVA_TYPE = 9
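+
+ # NOTE (editor's comment): these integers are persisted as env_meta["type"] by each
+ # wrapper's serialize() method and stored inside hdf5 datasets, so existing values
+ # should not be renumbered (which is presumably why the numbering has gaps).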
+
+
+class EnvBase(abc.ABC):
+ """A base class method for environments used by this repo."""
+ @abc.abstractmethod
+ def __init__(
+ self,
+ env_name,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=False,
+ use_depth_obs=False,
+ postprocess_visual_obs=True,
+ **kwargs,
+ ):
+ """
+ Args:
+ env_name (str): name of environment. Only needs to be provided if making a different
+ environment from the one in @env_meta.
+
+ render (bool): if True, environment supports on-screen rendering
+
+ render_offscreen (bool): if True, environment supports off-screen rendering. This
+ is forced to be True if @env_meta["use_images"] is True.
+
+ use_image_obs (bool): if True, environment is expected to render rgb image observations
+ on every env.step call. Set this to False for efficiency reasons, if image
+ observations are not required.
+
+ use_depth_obs (bool): if True, environment is expected to render depth image observations
+ on every env.step call. Set this to False for efficiency reasons, if depth
+ observations are not required.
+
+ postprocess_visual_obs (bool): if True, postprocess image observations
+ to prepare for learning. This should only be False when extracting observations
+ for saving to a dataset (to save space on RGB images for example).
+ """
+ return
+
+ @abc.abstractmethod
+ def step(self, action):
+ """
+ Step in the environment with an action.
+
+ Args:
+ action (np.array): action to take
+
+ Returns:
+ observation (dict): new observation dictionary
+ reward (float): reward for this step
+ done (bool): whether the task is done
+ info (dict): extra information
+ """
+ return
+
+ @abc.abstractmethod
+ def reset(self):
+ """
+ Reset environment.
+
+ Returns:
+ observation (dict): initial observation dictionary.
+ """
+ return
+
+ @abc.abstractmethod
+ def reset_to(self, state):
+ """
+ Reset to a specific simulator state.
+
+ Args:
+ state (dict): current simulator state
+
+ Returns:
+ observation (dict): observation dictionary after setting the simulator state
+ """
+ return
+
+ @abc.abstractmethod
+ def render(self, mode="human", height=None, width=None, camera_name=None):
+ """Render"""
+ return
+
+ @abc.abstractmethod
+ def get_observation(self):
+ """Get environment observation"""
+ return
+
+ @abc.abstractmethod
+ def get_state(self):
+ """Get environment simulator state, compatible with @reset_to"""
+ return
+
+ @abc.abstractmethod
+ def get_reward(self):
+ """
+ Get current reward.
+ """
+ return
+
+ @abc.abstractmethod
+ def get_goal(self):
+ """
+ Get goal observation. Not all environments support this.
+ """
+ return
+
+ @abc.abstractmethod
+ def set_goal(self, **kwargs):
+ """
+ Set goal observation with external specification. Not all environments support this.
+ """
+ return
+
+ @abc.abstractmethod
+ def is_done(self):
+ """
+ Check if the task is done (not necessarily successful).
+ """
+ return
+
+ @abc.abstractmethod
+ def is_success(self):
+ """
+ Check if the task condition(s) is reached. Should return a dictionary
+ { str: bool } with at least a "task" key for the overall task success,
+ and additional optional keys corresponding to other task criteria.
+ """
+ return
+
+ @property
+ @abc.abstractmethod
+ def action_dimension(self):
+ """
+ Returns dimension of actions (int).
+ """
+ return
+
+ @property
+ @abc.abstractmethod
+ def name(self):
+ """
+ Returns name of the environment (str).
+ """
+ return
+
+ @property
+ @abc.abstractmethod
+ def type(self):
+ """
+ Returns environment type (int) for this kind of environment.
+ This helps identify this env class.
+ """
+ return
+
+ @property
+ def version(self):
+ """
+ Returns version of environment (str).
+ This is not an abstract method, some subclasses do not implement it
+ """
+ return None
+
+ @abc.abstractmethod
+ def serialize(self):
+ """
+ Save all information needed to re-instantiate this environment in a dictionary.
+ This is the same as @env_meta - environment metadata stored in hdf5 datasets,
+ and used in utils/env_utils.py.
+ """
+ return
+
+ @classmethod
+ @abc.abstractmethod
+ def create_for_data_processing(
+ cls,
+ camera_names,
+ camera_height,
+ camera_width,
+ reward_shaping,
+ render=None,
+ render_offscreen=None,
+ use_image_obs=None,
+ use_depth_obs=None,
+ **kwargs,
+ ):
+ """
+ Create environment for processing datasets, which includes extracting
+ observations, labeling dense / sparse rewards, and annotating dones in
+ transitions.
+
+ Args:
+ camera_names ([str]): list of camera names that correspond to image observations
+ camera_height (int): camera height for all cameras
+ camera_width (int): camera width for all cameras
+ reward_shaping (bool): if True, use shaped environment rewards, else use sparse task completion rewards
+ render (bool or None): optionally override rendering behavior. Defaults to False.
+ render_offscreen (bool or None): optionally override rendering behavior. The default value is True if
+ @camera_names is non-empty, False otherwise.
+ use_image_obs (bool or None): optionally override rendering behavior. The default value is True if
+ @camera_names is non-empty, False otherwise.
+ use_depth_obs (bool): if True, use depth observations
+
+ Returns:
+ env (EnvBase instance)
+ """
+ return
+
+ @property
+ @abc.abstractmethod
+ def rollout_exceptions(self):
+ """
+ Return tuple of exceptions to except when doing rollouts. This is useful to ensure
+ that the entire training run doesn't crash because of a bad policy that causes unstable
+ simulation computations.
+ """
+ return
+
+ @property
+ @abc.abstractmethod
+ def base_env(self):
+ """
+ Grabs base simulation environment.
+ """
+ return
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/envs/env_gym.py b/phantom/submodules/phantom-robomimic/robomimic/envs/env_gym.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b56d1ebb3be670c8e2207fa6afcdf4ee1ec5190
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/envs/env_gym.py
@@ -0,0 +1,267 @@
+"""
+This file contains the gym environment wrapper that is used
+to provide a standardized environment API for training policies and interacting
+with metadata present in datasets.
+"""
+import json
+import numpy as np
+from copy import deepcopy
+
+import gym
+try:
+ import d4rl
+except:
+ print("WARNING: could not load d4rl environments!")
+
+import robomimic.envs.env_base as EB
+import robomimic.utils.obs_utils as ObsUtils
+
+
+class EnvGym(EB.EnvBase):
+ """Wrapper class for gym"""
+ def __init__(
+ self,
+ env_name,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=False,
+ use_depth_obs=False,
+ postprocess_visual_obs=True,
+ **kwargs,
+ ):
+ """
+ Args:
+ env_name (str): name of environment. Only needs to be provided if making a different
+ environment from the one in @env_meta.
+
+ render (bool): ignored - gym envs always support on-screen rendering
+
+ render_offscreen (bool): ignored - gym envs always support off-screen rendering
+
+ use_image_obs (bool): ignored - gym envs don't typically use images
+
+ postprocess_visual_obs (bool): ignored - gym envs don't typically use images
+ """
+ self._init_kwargs = deepcopy(kwargs)
+ self._env_name = env_name
+ self._current_obs = None
+ self._current_reward = None
+ self._current_done = None
+ self._done = None
+ self.env = gym.make(env_name, **kwargs)
+
+ def step(self, action):
+ """
+ Step in the environment with an action.
+
+ Args:
+ action (np.array): action to take
+
+ Returns:
+ observation (dict): new observation dictionary
+ reward (float): reward for this step
+ done (bool): whether the task is done
+ info (dict): extra information
+ """
+ obs, reward, done, info = self.env.step(action)
+ self._current_obs = obs
+ self._current_reward = reward
+ self._current_done = done
+ return self.get_observation(obs), reward, self.is_done(), info
+
+ def reset(self):
+ """
+ Reset environment.
+
+ Returns:
+ observation (dict): initial observation dictionary.
+ """
+ self._current_obs = self.env.reset()
+ self._current_reward = None
+ self._current_done = None
+ return self.get_observation(self._current_obs)
+
+ def reset_to(self, state):
+ """
+ Reset to a specific simulator state.
+
+ Args:
+ state (dict): current simulator state that contains:
+ - states (np.ndarray): initial state of the mujoco environment
+
+ Returns:
+ observation (dict): observation dictionary after setting the simulator state
+ """
+ if hasattr(self.env.unwrapped.sim, "set_state_from_flattened"):
+ self.env.unwrapped.sim.set_state_from_flattened(state["states"])
+ self.env.unwrapped.sim.forward()
+ return { "flat" : self.env.unwrapped._get_obs() }
+ else:
+ raise NotImplementedError
+
+ def render(self, mode="human", height=None, width=None, camera_name=None, **kwargs):
+ """
+ Render from simulation to either an on-screen window or off-screen to RGB array.
+
+ Args:
+ mode (str): pass "human" for on-screen rendering or "rgb_array" for off-screen rendering
+ height (int): height of image to render - only used if mode is "rgb_array"
+ width (int): width of image to render - only used if mode is "rgb_array"
+ """
+ if mode =="human":
+ return self.env.render(mode=mode, **kwargs)
+ if mode == "rgb_array":
+ return self.env.render(mode="rgb_array", height=height, width=width)
+ else:
+ raise NotImplementedError("mode={} is not implemented".format(mode))
+
+ def get_observation(self, obs=None):
+ """
+ Get current environment observation dictionary.
+
+ Args:
+ obs (np.array): current flat observation vector to wrap and provide as a dictionary.
+ If not provided, uses self._current_obs.
+ """
+ if obs is None:
+ assert self._current_obs is not None
+ obs = self._current_obs
+ return { "flat" : np.copy(obs) }
+
+ def get_state(self):
+ """
+ Get current environment simulator state as a dictionary. Should be compatible with @reset_to.
+ """
+ # NOTE: assumes MuJoCo gym task!
+ xml = self.env.sim.model.get_xml() # model xml file
+ state = np.array(self.env.sim.get_state().flatten()) # simulator state
+ return dict(model=xml, states=state)
+
+ def get_reward(self):
+ """
+ Get current reward.
+ """
+ assert self._current_reward is not None
+ return self._current_reward
+
+ def get_goal(self):
+ """
+ Get goal observation. Not all environments support this.
+ """
+ raise NotImplementedError
+
+ def set_goal(self, **kwargs):
+ """
+ Set goal observation with external specification. Not all environments support this.
+ """
+ raise NotImplementedError
+
+ def is_done(self):
+ """
+ Check if the task is done (not necessarily successful).
+ """
+ assert self._current_done is not None
+ return self._current_done
+
+ def is_success(self):
+ """
+ Check if the task condition(s) is reached. Should return a dictionary
+ { str: bool } with at least a "task" key for the overall task success,
+ and additional optional keys corresponding to other task criteria.
+ """
+ if hasattr(self.env.unwrapped, "_check_success"):
+ return self.env.unwrapped._check_success()
+
+ # gym envs generally don't check task success - we only compare returns
+ return { "task" : False }
+
+ @property
+ def action_dimension(self):
+ """
+ Returns dimension of actions (int).
+ """
+ return self.env.action_space.shape[0]
+
+ @property
+ def name(self):
+ """
+ Returns name of the environment (str).
+ """
+ return self._env_name
+
+ @property
+ def type(self):
+ """
+ Returns environment type (int) for this kind of environment.
+ This helps identify this env class.
+ """
+ return EB.EnvType.GYM_TYPE
+
+ def serialize(self):
+ """
+ Save all information needed to re-instantiate this environment in a dictionary.
+ This is the same as @env_meta - environment metadata stored in hdf5 datasets,
+ and used in utils/env_utils.py.
+ """
+ return dict(env_name=self.name, type=self.type, env_kwargs=deepcopy(self._init_kwargs))
+
+ @classmethod
+ def create_for_data_processing(
+ cls,
+ env_name,
+ camera_names,
+ camera_height,
+ camera_width,
+ reward_shaping,
+ render=None,
+ render_offscreen=None,
+ use_image_obs=None,
+ use_depth_obs=None,
+ **kwargs,
+ ):
+ """
+ Create environment for processing datasets, which includes extracting
+ observations, labeling dense / sparse rewards, and annotating dones in
+ transitions. For gym environments, input arguments (other than @env_name)
+ are ignored, since environments are mostly pre-configured.
+
+ Args:
+ env_name (str): name of gym environment to create
+
+ Returns:
+ env (EnvGym instance)
+ """
+
+ # make sure to initialize obs utils so it knows which modalities are image modalities.
+ # For currently supported gym tasks, there are no image observations.
+ obs_modality_specs = {
+ "obs": {
+ "low_dim": ["flat"],
+ "rgb": [],
+ }
+ }
+ ObsUtils.initialize_obs_utils_with_obs_specs(obs_modality_specs)
+
+ return cls(env_name=env_name, **kwargs)
+
+ @property
+ def rollout_exceptions(self):
+ """
+ Return tuple of exceptions to except when doing rollouts. This is useful to ensure
+ that the entire training run doesn't crash because of a bad policy that causes unstable
+ simulation computations.
+ """
+ return ()
+
+ @property
+ def base_env(self):
+ """
+ Grabs base simulation environment.
+ """
+ return self.env
+
+ def __repr__(self):
+ """
+ Pretty-print env description.
+ """
+ return self.name + "\n" + json.dumps(self._init_kwargs, sort_keys=True, indent=4)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/envs/env_ig_momart.py b/phantom/submodules/phantom-robomimic/robomimic/envs/env_ig_momart.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd0a0db9df116ea57fe75cad5c526a07bb08e81d
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/envs/env_ig_momart.py
@@ -0,0 +1,414 @@
+"""
+Wrapper environment class to enable using iGibson-based environments used in the MOMART paper
+"""
+
+from copy import deepcopy
+import numpy as np
+import json
+
+import pybullet as p
+import gibson2
+from gibson2.envs.semantic_organize_and_fetch import SemanticOrganizeAndFetch
+from gibson2.utils.custom_utils import ObjectConfig
+import gibson2.external.pybullet_tools.utils as PBU
+import tempfile
+import os
+import yaml
+import cv2
+
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.envs.env_base as EB
+
+
+# TODO: Once iG 2.0 is more stable, automate available environments, similar to robosuite
+ENV_MAPPING = {
+ "SemanticOrganizeAndFetch": SemanticOrganizeAndFetch,
+}
+
+
+class EnvGibsonMOMART(EB.EnvBase):
+ """
+ Wrapper class for gibson environments (https://github.com/StanfordVL/iGibson) specifically compatible with
+ MoMaRT datasets
+ """
+ def __init__(
+ self,
+ env_name,
+ ig_config,
+ postprocess_visual_obs=True,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=False,
+ use_depth_obs=False,
+ image_height=None,
+ image_width=None,
+ physics_timestep=1./240.,
+ action_timestep=1./20.,
+ **kwargs,
+ ):
+ """
+ Args:
+ ig_config (dict): YAML configuration to use for iGibson, as a dict
+
+ postprocess_visual_obs (bool): if True, postprocess image observations
+ to prepare for learning
+
+ render (bool): if True, environment supports on-screen rendering
+
+ render_offscreen (bool): if True, environment supports off-screen rendering. This
+ is forced to be True if @use_image_obs is True.
+
+ use_image_obs (bool): if True, environment is expected to render rgb image observations
+ on every env.step call. Set this to False for efficiency reasons, if image
+ observations are not required.
+
+ use_depth_obs (bool): if True, environment is expected to render depth image observations
+ on every env.step call. Set this to False for efficiency reasons, if depth
+ observations are not required.
+
+ render_mode (str): How to run simulation rendering. Options are {"pbgui", "iggui", or "headless"}
+
+ image_height (int): If specified, overrides internal iG image height when rendering
+
+ image_width (int): If specified, overrides internal iG image width when rendering
+
+ physics_timestep (float): Pybullet physics timestep to use
+
+ action_timestep (float): Action timestep to use for robot in simulation
+
+ kwargs (unrolled dict): Any args to substitute in the ig_configuration
+ """
+ self._env_name = env_name
+ self.ig_config = deepcopy(ig_config)
+ self.postprocess_visual_obs = postprocess_visual_obs
+ self._init_kwargs = kwargs
+
+ # Determine rendering mode
+ self.render_mode = "iggui" if render else "headless"
+ self.render_onscreen = render
+
+ # Make sure rgb is part of obs in ig config
+ self.ig_config["output"] = list(set(self.ig_config["output"] + ["rgb"]))
+
+ # Warn user that iG always uses a renderer
+ if (not render) and (not render_offscreen):
+ print("WARNING: iGibson always uses a renderer -- using headless by default.")
+
+ # Update ig config
+ for k, v in kwargs.items():
+ assert k in self.ig_config, f"Got unknown ig configuration key {k}!"
+ self.ig_config[k] = v
+
+ # Set rendering values
+ self.obs_img_height = image_height if image_height is not None else self.ig_config.get("obs_image_height", 120)
+ self.obs_img_width = image_width if image_width is not None else self.ig_config.get("obs_image_width", 120)
+
+ # Get class to create
+ envClass = ENV_MAPPING.get(self._env_name, None)
+
+ # Make sure we have a valid environment class
+ assert envClass is not None, "No valid environment for the requested task was found!"
+
+ # Set device idx for rendering
+ # ensure that we select the correct GPU device for rendering by testing for EGL rendering
+ # NOTE: this package should be installed from this link (https://github.com/StanfordVL/egl_probe)
+ import egl_probe
+ device_idx = 0
+ valid_gpu_devices = egl_probe.get_available_devices()
+ if len(valid_gpu_devices) > 0:
+ device_idx = valid_gpu_devices[0]
+
+ # Create environment
+ self.env = envClass(
+ config_file=deepcopy(self.ig_config),
+ mode=self.render_mode,
+ physics_timestep=physics_timestep,
+ action_timestep=action_timestep,
+ device_idx=device_idx,
+ )
+
+ # If we have a viewer, make sure to remove all bodies belonging to the visual markers
+ self.exclude_body_ids = [] # Bodies to exclude when saving state
+ if self.env.simulator.viewer is not None:
+ self.exclude_body_ids.append(self.env.simulator.viewer.constraint_marker.body_id)
+ self.exclude_body_ids.append(self.env.simulator.viewer.constraint_marker2.body_id)
+
+ def step(self, action):
+ """
+ Step in the environment with an action
+
+ Args:
+ action: action to take
+
+ Returns:
+ observation: new observation
+ reward: step reward
+ done: whether the task is done
+ info: extra information
+ """
+ obs, r, done, info = self.env.step(action)
+ obs = self.get_observation(obs)
+ return obs, r, self.is_done(), info
+
+ def reset(self):
+ """Reset environment"""
+ di = self.env.reset()
+ return self.get_observation(di)
+
+ def reset_to(self, state):
+ """
+ Reset to a specific state
+ Args:
+ state (dict): contains:
+ - states (np.ndarray): initial state of the mujoco environment
+ - goal (dict): goal components to reset
+ Returns:
+ new observation
+ """
+ if "states" in state:
+ self.env.reset_to(state["states"], exclude=self.exclude_body_ids)
+
+ if "goal" in state:
+ self.set_goal(**state["goal"])
+
+ # Return obs
+ return self.get_observation()
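+
+ # NOTE (editor's comment): when states are restored directly like this, callers may
+ # also need sync_task() below so the iGibson task refreshes its internal
+ # target-object and location bookkeeping for the new episode.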
+
+ def render(self, mode="human", camera_name="rgb", height=None, width=None):
+ """
+ Render
+
+ Args:
+ mode (str): Mode(s) to render. Options are either 'human' (rendering onscreen) or 'rgb' (rendering to
+ frames offscreen)
+ camera_name (str): Name of the camera to use -- valid options are "rgb" or "rgb_wrist"
+ height (int): If specified with width, resizes the rendered image to this height
+ width (int): If specified with height, resizes the rendered image to this width
+
+ Returns:
+ array or None: If rendering to frame, returns the rendered frame. Otherwise, returns None
+ """
+ # Only robotview camera is currently supported
+ assert camera_name in {"rgb", "rgb_wrist"}, \
+ f"Only rgb, rgb_wrist cameras currently supported, got {camera_name}."
+
+ if mode == "human":
+ assert self.render_onscreen, "Rendering has not been enabled for onscreen!"
+ self.env.simulator.sync()
+ else:
+ assert self.env.simulator.renderer is not None, "No renderer enabled for this env!"
+
+ frame = self.env.sensors["vision"].get_obs(self.env)[camera_name]
+
+ # Reshape all frames
+ if height is not None and width is not None:
+ frame = cv2.resize(frame, dsize=(width, height), interpolation=cv2.INTER_CUBIC) # cv2.resize expects dsize as (width, height)
+ return frame
+
+ def resize_obs_frame(self, frame):
+ """
+ Resizes frame to be internal height and width values
+ """
+ return cv2.resize(frame, dsize=(self.obs_img_width, self.obs_img_height), interpolation=cv2.INTER_CUBIC)
+
+ def get_observation(self, di=None):
+ """Get environment observation"""
+ if di is None:
+ di = self.env.get_state()
+ ret = {}
+ for k in di:
+ # RGB Images
+ if "rgb" in k:
+ ret[k] = di[k]
+ # ret[k] = np.transpose(di[k], (2, 0, 1))
+ if self.postprocess_visual_obs:
+ ret[k] = ObsUtils.process_obs(obs=self.resize_obs_frame(ret[k]), obs_key=k)
+
+ # Depth images
+ elif "depth" in k:
+ # ret[k] = np.transpose(di[k], (2, 0, 1))
+ # Values can be corrupted (negative or > 1.0, so we clip values)
+ ret[k] = np.clip(di[k], 0.0, 1.0)
+ if self.postprocess_visual_obs:
+ ret[k] = ObsUtils.process_obs(obs=self.resize_obs_frame(ret[k])[..., None], obs_key=k)
+
+ # Segmentation Images
+ elif "seg" in k:
+ ret[k] = di[k][..., None]
+ if self.postprocess_visual_obs:
+ ret[k] = ObsUtils.process_obs(obs=self.resize_obs_frame(ret[k]), obs_key=k)
+
+ # Scans
+ elif "scan" in k:
+ ret[k] = np.transpose(np.array(di[k]), axes=(1, 0))
+
+ # Compose proprio obs
+ proprio_obs = di["proprio"]
+
+ # Compute intermediate values
+ lin_vel = np.linalg.norm(proprio_obs["base_lin_vel"][:2])
+ ang_vel = proprio_obs["base_ang_vel"][2]
+
+ ret["proprio"] = np.concatenate([
+ proprio_obs["head_joint_pos"],
+ proprio_obs["grasped"],
+ proprio_obs["eef_pos"],
+ proprio_obs["eef_quat"],
+ ])
+
+ # Proprio info that's only relevant for navigation
+ ret["proprio_nav"] = np.concatenate([
+ [lin_vel],
+ [ang_vel],
+ ])
+
+ # Compose task obs
+ ret["object"] = np.concatenate([
+ np.array(di["task_obs"]["object-state"]),
+ ])
+
+ # Add ground truth navigational state
+ ret["gt_nav"] = np.concatenate([
+ proprio_obs["base_pos"][:2],
+ [np.sin(proprio_obs["base_rpy"][2])],
+ [np.cos(proprio_obs["base_rpy"][2])],
+ ])
+
+ return ret
+
+ def sync_task(self):
+ """
+ Method to synchronize iG task, since we're not actually resetting the env but instead setting states directly.
+ Should only be called after resetting the initial state of an episode
+ """
+ self.env.task.update_target_object_init_pos()
+ self.env.task.update_location_info()
+
+ def set_task_conditions(self, task_conditions):
+ """
+ Method to override task conditions (e.g.: target object), useful in cases such as playing back
+ from demonstrations
+
+ Args:
+ task_conditions (dict): Keyword-mapped arguments to pass to task instance to set internally
+ """
+ self.env.set_task_conditions(task_conditions)
+
+ def get_state(self):
+ """Get iG flattened state"""
+ return {"states": PBU.WorldSaver(exclude_body_ids=self.exclude_body_ids).serialize()}
+
+ def get_reward(self):
+ return self.env.task.get_reward(self.env)[0]
+ # return float(self.is_success()["task"])
+
+ def get_goal(self):
+ """Get goal specification"""
+ # No support yet in iG
+ raise NotImplementedError
+
+ def set_goal(self, **kwargs):
+ """Set env target with external specification"""
+ # No support yet in iG
+ raise NotImplementedError
+
+ def is_done(self):
+ """Check if the agent is done (not necessarily successful)."""
+ return False
+
+ def is_success(self):
+ """
+ Check if the task condition(s) is reached. Should return a dictionary
+ { str: bool } with at least a "task" key for the overall task success,
+ and additional optional keys corresponding to other task criteria.
+ """
+ succ = self.env.check_success()
+ if isinstance(succ, dict):
+ assert "task" in succ
+ return succ
+ return { "task" : succ }
+
+ @classmethod
+ def create_for_data_processing(
+ cls,
+ env_name,
+ camera_names,
+ camera_height,
+ camera_width,
+ reward_shaping,
+ render=None,
+ render_offscreen=None,
+ use_image_obs=None,
+ use_depth_obs=None,
+ **kwargs,
+ ):
+ """
+ Create environment for processing datasets, which includes extracting
+ observations, labeling dense / sparse rewards, and annotating dones in
+ transitions.
+
+ Args:
+ env_name (str): name of environment
+ camera_names (list of str): list of camera names that correspond to image observations
+ camera_height (int): camera height for all cameras
+ camera_width (int): camera width for all cameras
+ reward_shaping (bool): if True, use shaped environment rewards, else use sparse task completion rewards
+ render (bool or None): optionally override rendering behavior
+ render_offscreen (bool or None): optionally override rendering behavior
+ use_image_obs (bool or None): optionally override rendering behavior
+ """
+ has_camera = (len(camera_names) > 0)
+
+ # note that @postprocess_visual_obs is False since this env's images will be written to a dataset
+ return cls(
+ env_name=env_name,
+ render=(False if render is None else render),
+ render_offscreen=(has_camera if render_offscreen is None else render_offscreen),
+ use_image_obs=(has_camera if use_image_obs is None else use_image_obs),
+ postprocess_visual_obs=False,
+ image_height=camera_height,
+ image_width=camera_width,
+ **kwargs,
+ )
+
+ @property
+ def action_dimension(self):
+ """Action dimension"""
+ return self.env.robots[0].action_dim
+
+ @property
+ def name(self):
+ """Environment name"""
+ return self._env_name
+
+ @property
+ def type(self):
+ """Environment type"""
+ return EB.EnvType.IG_MOMART_TYPE
+
+ def serialize(self):
+ """Serialize to dictionary"""
+ return dict(env_name=self.name, type=self.type,
+ ig_config=self.ig_config,
+ env_kwargs=deepcopy(self._init_kwargs))
+
+ @classmethod
+ def deserialize(cls, info, postprocess_visual_obs=True):
+ """Create environment with external info"""
+ return cls(env_name=info["env_name"], ig_config=info["ig_config"], postprocess_visual_obs=postprocess_visual_obs, **info["env_kwargs"])
+
+ @property
+ def rollout_exceptions(self):
+ """Return tuple of exceptions to except when doing rollouts"""
+ return (RuntimeError,) # must be a tuple of exception classes
+
+ @property
+ def base_env(self):
+ """
+ Grabs base simulation environment.
+ """
+ return self.env
+
+ def __repr__(self):
+ return self.name + "\n" + json.dumps(self._init_kwargs, sort_keys=True, indent=4) + \
+ "\niGibson Config: \n" + json.dumps(self.ig_config, sort_keys=True, indent=4)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/envs/env_real_panda.py b/phantom/submodules/phantom-robomimic/robomimic/envs/env_real_panda.py
new file mode 100644
index 0000000000000000000000000000000000000000..c59a979724fe946c0e56c4e9f0e8eab6b8d03214
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/envs/env_real_panda.py
@@ -0,0 +1,448 @@
+"""
+This file contains an environment wrapper for a real Franka Panda robot. It provides
+the standardized environment API used for training policies and for interacting
+with metadata present in datasets.
+"""
+import time
+import json
+import sys
+import numpy as np
+from copy import deepcopy
+
+import cv2
+
+import RobotTeleop
+import RobotTeleop.utils as U
+from RobotTeleop.utils import Rate, RateMeasure, Timers
+
+import robomimic.envs.env_base as EB
+import robomimic.utils.obs_utils as ObsUtils
+
+class EnvRealPanda(EB.EnvBase):
+ """Wrapper class for real panda environment"""
+ def __init__(
+ self,
+ env_name,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=True,
+ use_depth_obs=False,
+ postprocess_visual_obs=True,
+ control_freq=20.,
+ action_scale=None,
+ camera_names_to_sizes=None,
+ init_ros_node=True,
+ publish_target_pose=False,
+ fake_controller=False,
+ use_moveit=True,
+ ):
+ """
+ Args:
+ env_name (str): name of environment.
+
+ render (bool): ignored - on-screen rendering is not supported
+
+ render_offscreen (bool): ignored - image observations are supplied by default
+
+ use_image_obs (bool): ignored - image observations are used by default.
+
+ postprocess_visual_obs (bool): if True, postprocess image observations
+ to prepare for learning. This should only be False when extracting observations
+ for saving to a dataset (to save space on RGB images for example).
+
+ control_freq (int): real-world control frequency to try and enforce through rate-limiting
+
+ action_scale (list): list of 7 numbers for what the -1 and 1 action in each dimension corresponds to
+ for the physical robot action space
+
+ camera_names_to_sizes (dict): dictionary that maps camera names to tuple of image height and width
+ to return
+ """
+ self._env_name = env_name
+ self.postprocess_visual_obs = postprocess_visual_obs
+ self.control_freq = control_freq
+
+ # to enforce control rate
+ self.rate = Rate(control_freq)
+ self.rate_measure = RateMeasure(name="robot", freq_threshold=round(0.95 * control_freq))
+ self.timers = Timers(history=100, disable_on_creation=False)
+
+ assert (action_scale is not None), "must provide action scaling bounds"
+ assert len(action_scale) == 7, "must provide scaling for all dimensions"
+ self.action_scale = np.array(action_scale).reshape(-1)
+
+ camera_names_to_sizes = deepcopy(camera_names_to_sizes)
+ if camera_names_to_sizes is None:
+ self.camera_names_to_sizes = {}
+ else:
+ self.camera_names_to_sizes = camera_names_to_sizes
+
+ # save kwargs for serialization
+ kwargs = dict(
+ camera_names_to_sizes=camera_names_to_sizes,
+ action_scale=action_scale,
+ init_ros_node=init_ros_node,
+ publish_target_pose=publish_target_pose,
+ fake_controller=fake_controller,
+ use_moveit=use_moveit,
+ control_freq=control_freq
+ )
+ self._init_kwargs = deepcopy(kwargs)
+
+ # connect to robot
+ # if (sys.version_info > (3, 0)):
+ # from RobotTeleop.robots.panda_redis_interface import PandaRedisInterface
+ # self.robot_interface = PandaRedisInterface(
+ # init_ros_node=init_ros_node,
+ # publish_target_pose=publish_target_pose,
+ # fake_controller=fake_controller,
+ # use_moveit=use_moveit,
+ # camera_names_to_sizes=camera_names_to_sizes,
+ # debug_times=True,
+ # )
+ # else:
+ from RobotTeleop.robots.panda_ros_interface import PandaRosInterface
+ self.robot_interface = PandaRosInterface(
+ init_ros_node=init_ros_node,
+ publish_target_pose=publish_target_pose,
+ fake_controller=fake_controller,
+ use_moveit=use_moveit,
+ camera_names_to_sizes=camera_names_to_sizes,
+ #use_redis=True,
+ )
+
+ # IMPORTANT: initialize JIT functions that may need to compile
+ self._compile_jit_functions()
+
+ # last grasp action - initialize to false, since gripper should start open
+ self.did_grasp = False
+
+ def _compile_jit_functions(self):
+ """
+ Helper function to incur the cost of compiling jit functions used by this class upfront.
+
+ NOTE: this function looks strange because we apparently need to make it look like the env.step function
+ for it to compile properly, otherwise we will have a heavy delay on the first env.step call...
+
+ TODO: figure out why this needs to look like the step function code below...
+ """
+
+ # current robot state to use as reference
+ ee_pos, ee_quat = self.robot_interface.ee_pose
+ ee_mat = U.quat2mat(ee_quat)
+ ee_quat_hat = U.mat2quat(ee_mat)
+
+ # convert delta axis-angle to delta rotation matrix, and from there, to absolute target rotation
+ drot = np.array([0., 0., 0.05])
+ angle = np.linalg.norm(drot)
+ if U.isclose(angle, 0.):
+ drot_quat = np.array([0., 0., 0., 1.])
+ else:
+ axis = drot / angle
+ drot_quat = U.axisangle2quat(axis, angle)
+
+ # get target rotation
+ drot_mat = U.quat2mat(drot_quat)
+ target_rot_mat = (drot_mat.T).dot(ee_mat)
+ target_rot_quat = U.mat2quat(target_rot_mat)
+
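+ # NOTE (editor's comment): the 7-dim action below is laid out as
+ # [dx, dy, dz, axis-angle rotation (3), gripper], each entry in [-1, 1] and
+ # rescaled by self.action_scale; a gripper value below 0 closes the gripper,
+ # above 0 opens it.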
+ def step(self, action, need_obs=True):
+ """
+ Step in the environment with an action.
+
+ Args:
+ action (np.array): action to take, should be in [-1, 1]
+ need_obs (bool): if False, don't return the observation, because this
+ can involve copying image data around. This allows for more
+ flexibility on when observations are retrieved.
+
+ Returns:
+ observation (dict): new observation dictionary
+ reward (float): reward for this step
+ done (bool): whether the task is done
+ info (dict): extra information
+ """
+ assert len(action.shape) == 1 and action.shape[0] == 7, "action has incorrect dimensions"
+ assert np.min(action) >= -1. and np.max(action) <= 1., "incorrect action bounds"
+
+ # rate-limiting
+ self.rate.sleep()
+ self.rate_measure.measure()
+
+ self.timers.tic("real_panda_step")
+
+ # unscale action
+ action = self.action_scale * action
+
+ # extract action components
+ dpos = action[:3]
+ drot = action[3:6]
+ gripper_command = action[6:7]
+
+ # current robot state to use as reference
+ ee_pos, ee_quat = self.robot_interface.ee_pose
+ ee_mat = U.quat2mat(ee_quat)
+
+ # absolute target position
+ target_pos = ee_pos + dpos
+
+ # convert delta axis-angle to delta rotation matrix, and from there, to absolute target rotation
+ angle = np.linalg.norm(drot)
+ if U.isclose(angle, 0.):
+ drot_quat = np.array([0., 0., 0., 1.])
+ else:
+ axis = drot / angle
+ drot_quat = U.axisangle2quat(axis, angle)
+ drot_mat = U.quat2mat(drot_quat)
+ target_rot_mat = (drot_mat.T).dot(ee_mat)
+ target_rot_quat = U.mat2quat(target_rot_mat)
+
+ # play end effector action
+ self.robot_interface.move_to_ee_pose(pos=target_pos, ori=target_rot_quat)
+
+ # convert continuous control signal in [-1, 1] to boolean
+ should_close = (float(gripper_command) < 0.)
+
+ # only send command if trying to change gripper state.
+ # this is due to hardware limitations - robot grippers suck.
+ if should_close != self.did_grasp:
+ if should_close:
+ self.robot_interface.gripper_close()
+ else:
+ self.robot_interface.gripper_open()
+
+ # remember last grasp command
+ self.did_grasp = should_close
+
+ # get observation
+ obs = None
+ if need_obs:
+ obs = self.get_observation()
+ r = self.get_reward()
+ done = self.is_done()
+
+ self.timers.toc("real_panda_step")
+
+ return obs, r, done, {}
+
+ def reset(self):
+ """
+ Reset environment.
+
+ Returns:
+ observation (dict): initial observation dictionary.
+ """
+ self.robot_interface.gripper_open()
+ self.robot_interface.reset_teleop()
+ self.rate_measure = RateMeasure(name="robot", freq_threshold=round(0.95 * self.control_freq))
+
+ return self.get_observation()
+
+ def reset_to(self, state):
+ """
+ Reset to a specific state. On real robot, we visualize the start image,
+ and a human should manually reset the scene.
+
+ Reset to a specific simulator state.
+
+ Args:
+ state (dict): initial state that contains:
+ - image (np.ndarray): initial workspace image
+
+ Returns:
+ None
+ """
+ assert "front_image" in state
+ ref_img = cv2.cvtColor(state["front_image"], cv2.COLOR_RGB2BGR)
+
+ print("\n" + "*" * 50)
+ print("Reset environment to image shown in left pane")
+ print("Press 'c' when ready to continue.")
+ print("*" * 50 + "\n")
+ while True:
+ # read current image
+ cur_img = self.robot_interface.get_camera_frame(camera_name="front_image")
+ cur_img = cv2.cvtColor(cur_img, cv2.COLOR_RGB2BGR)
+
+ # concatenate frames to display
+ img = np.concatenate([ref_img, cur_img], axis=1)
+
+ # display frame
+ cv2.imshow('initial state alignment window', img)
+ if cv2.waitKey(1) & 0xFF == ord('c'):
+ cv2.destroyAllWindows()
+ break
+
+ def render(self, mode="human", height=None, width=None, camera_name=None, **kwargs):
+ """
+ Render from simulation to either an on-screen window or off-screen to RGB array.
+
+ Args:
+ mode (str): pass "human" for on-screen rendering or "rgb_array" for off-screen rendering
+ height (int): height of image to render - only used if mode is "rgb_array"
+ width (int): width of image to render - only used if mode is "rgb_array"
+ """
+ if mode =="human":
+ raise Exception("on-screen rendering not supported currently")
+ if mode == "rgb_array":
+ # assert (height is None) and (width is None), "cannot resize images"
+ assert camera_name in self.camera_names_to_sizes, "invalid camera name"
+ return self.robot_interface.get_camera_frame(camera_name=camera_name)
+ else:
+ raise NotImplementedError("mode={} is not implemented".format(mode))
+
+ def get_observation(self, obs=None):
+ """
+ Get current environment observation dictionary.
+
+ Args:
+ obs (np.array): ignored by this environment - observations are read directly from the robot interface.
+ """
+ self.timers.tic("get_observation")
+ observation = {}
+ observation["ee_pose"] = np.concatenate(self.robot_interface.ee_pose)
+ observation["joint_positions"] = self.robot_interface.joint_position
+ observation["joint_velocities"] = self.robot_interface.joint_velocity
+ observation["gripper_position"] = self.robot_interface.gripper_position
+ observation["gripper_velocity"] = self.robot_interface.gripper_velocity
+ for cam_name in self.camera_names_to_sizes:
+ im = self.robot_interface.get_camera_frame(camera_name=cam_name)
+ if self.postprocess_visual_obs:
+ im = ObsUtils.process_image(im)
+ observation[cam_name] = im
+ self.timers.toc("get_observation")
+ return observation
+
+ def get_state(self):
+ """
+ Get current environment simulator state as a dictionary. Should be compatible with @reset_to.
+ """
+ return dict(states=np.zeros(1))
+ # raise Exception("Real robot has no simulation state.")
+
+ def get_reward(self):
+ """
+ Get current reward.
+ """
+ return 0.
+
+ def get_goal(self):
+ """
+ Get goal observation. Not all environments support this.
+ """
+ raise NotImplementedError
+
+ def set_goal(self, **kwargs):
+ """
+ Set goal observation with external specification. Not all environments support this.
+ """
+ raise NotImplementedError
+
+ def is_done(self):
+ """
+ Check if the task is done (not necessarily successful).
+ """
+ return False
+
+ def is_success(self):
+ """
+ Check if the task condition(s) is reached. Should return a dictionary
+ { str: bool } with at least a "task" key for the overall task success,
+ and additional optional keys corresponding to other task criteria.
+ """
+
+ # real robot environments don't usually have a success check - this must be done manually
+ return { "task" : False }
+
+ @property
+ def action_dimension(self):
+ """
+ Returns dimension of actions (int).
+ """
+ return 7
+
+ @property
+ def name(self):
+ """
+ Returns name of the environment (str).
+ """
+ # return self._env_name
+
+ # for the real robot, ensure the class name is stored in env meta (as the env name) for use
+ # with any external class registries
+ return self.__class__.__name__
+
+ @property
+ def type(self):
+ """
+ Returns environment type (int) for this kind of environment.
+ This helps identify this env class.
+ """
+ return EB.EnvType.REAL_TYPE
+
+ def serialize(self):
+ """
+ Save all information needed to re-instantiate this environment in a dictionary.
+ This is the same as @env_meta - environment metadata stored in hdf5 datasets,
+ and used in utils/env_utils.py.
+ """
+ return dict(env_name=self.name, type=self.type, env_kwargs=deepcopy(self._init_kwargs))
+
+ @classmethod
+ def create_for_data_processing(cls, env_name, camera_names, camera_height, camera_width, reward_shaping, **kwargs):
+ """
+ Create environment for processing datasets, which includes extracting
+ observations, labeling dense / sparse rewards, and annotating dones in
+ transitions. For gym environments, input arguments (other than @env_name)
+ are ignored, since environments are mostly pre-configured.
+
+ Args:
+ env_name (str): name of gym environment to create
+
+ Returns:
+ env (EnvRealPanda instance)
+ """
+
+ # initialize obs utils so it knows which modalities are image modalities
+ assert "camera_names_to_sizes" in kwargs
+ image_modalities = list(kwargs["camera_names_to_sizes"].keys())
+ obs_modality_specs = {
+ "obs": {
+ "low_dim": [], # technically unused, so we don't have to specify all of them
+ "image": image_modalities,
+ }
+ }
+ ObsUtils.initialize_obs_utils_with_obs_specs(obs_modality_specs)
+
+ # note that @postprocess_visual_obs is False since this env's images will be written to a dataset
+ return cls(
+ env_name=env_name,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=True,
+ postprocess_visual_obs=False,
+ **kwargs,
+ )
+
+ @property
+ def rollout_exceptions(self):
+ """
+ Return tuple of exceptions to except when doing rollouts. This is useful to ensure
+ that the entire training run doesn't crash because of a bad policy that causes unstable
+ simulation computations.
+ """
+ return ()
+
+ @property
+ def base_env(self):
+ """
+ Grabs base simulation environment.
+ """
+ # we don't wrap any env
+ return self
+
+ def __repr__(self):
+ """
+ Pretty-print env description.
+ """
+ return self.name + "\n" + json.dumps(self._init_kwargs, sort_keys=True, indent=4)
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/envs/env_real_panda_gprs.py b/phantom/submodules/phantom-robomimic/robomimic/envs/env_real_panda_gprs.py
new file mode 100644
index 0000000000000000000000000000000000000000..f931074dd0d7f9bb45b706373cd0559d81cd1e53
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/envs/env_real_panda_gprs.py
@@ -0,0 +1,732 @@
+"""
+Real robot env wrapper for Yifeng's GPRS control stack.
+"""
+import os
+import time
+import json
+import sys
+import numpy as np
+from copy import deepcopy
+from easydict import EasyDict as edict
+
+import cv2
+from PIL import Image
+
+import RobotTeleop
+import RobotTeleop.utils as U
+from RobotTeleop.utils import Rate, RateMeasure, Timers
+
+try:
+ # GPRS imports
+ from gprs.franka_interface import FrankaInterface
+ from gprs.camera_redis_interface import CameraRedisSubInterface
+ from gprs.utils import YamlConfig
+ from gprs import config_root
+
+ from rpl_vision_utils.utils import img_utils as ImgUtils
+except ImportError:
+ print("WARNING: no GPRS...")
+
+import robomimic.envs.env_base as EB
+import robomimic.utils.obs_utils as ObsUtils
+from robomimic.utils.log_utils import log_warning
+
+try:
+ import robosuite.utils.transform_utils as T
+except ImportError:
+ print("WARNING: could not import robosuite transform utils (needed for using absolute actions with GPRS")
+
+
+def center_crop(im, t_h, t_w):
+ assert(im.shape[-3] >= t_h and im.shape[-2] >= t_w)
+ assert(im.shape[-1] in [1, 3])
+ crop_h = int((im.shape[-3] - t_h) / 2)
+ crop_w = int((im.shape[-2] - t_w) / 2)
+ return im[..., crop_h:crop_h + t_h, crop_w:crop_w + t_w, :]
+
+
+def get_depth_scale(camera_name):
+ """
+ Returns scaling factor that converts from uint16 depth to real-valued depth (in meters).
+ """
+
+ # TODO: fix duplication
+ if camera_name == "front":
+ return 0.0010000000474974513
+ if camera_name == "wrist":
+ return 0.0010000000474974513
+ raise Exception("should not reach here")
+ # from RobotTeleop.scripts.debug_april_tag import get_depth_scale_unified
+ # return get_depth_scale_unified(camera_name=camera_name)
+
+
+class EnvRealPandaGPRS(EB.EnvBase):
+ """Wrapper class for real panda environment"""
+ def __init__(
+ self,
+ env_name,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=True,
+ postprocess_visual_obs=True,
+ control_freq=20.,
+ camera_names_to_sizes=None,
+ center_crop_images=True,
+ general_cfg_file=None,
+ controller_type=None,
+ controller_cfg_file=None,
+ controller_cfg_dict=None,
+ use_depth_obs=False,
+ absolute_actions=False, # use absolute pos and rot (axis-angle) in 7-dim action vector
+ # additional GPRS-specific args
+ state_freq=100.,
+ control_timeout=1.0,
+ has_gripper=True,
+ use_visualizer=False,
+ debug=False,
+ ):
+ """
+ Args:
+ env_name (str): name of environment.
+
+ render (bool): ignored - on-screen rendering is not supported
+
+ render_offscreen (bool): ignored - image observations are supplied by default
+
+ use_image_obs (bool): ignored - image observations are used by default.
+
+ postprocess_visual_obs (bool): if True, postprocess image observations
+ to prepare for learning. This should only be False when extracting observations
+ for saving to a dataset (to save space on RGB images for example).
+
+ control_freq (int): real-world control frequency to try and enforce through rate-limiting
+
+ camera_names_to_sizes (dict): dictionary that maps camera names to tuple of image height and width
+ to return
+ """
+ self._env_name = env_name
+ self.postprocess_visual_obs = postprocess_visual_obs
+ self.control_freq = control_freq
+ self.absolute_actions = absolute_actions
+ self.general_cfg_file = general_cfg_file
+ self.controller_type = controller_type
+ self.controller_cfg_file = controller_cfg_file
+ self.controller_cfg_dict = deepcopy(controller_cfg_dict) if controller_cfg_dict is not None else None
+ if self.controller_cfg_dict is not None:
+ # control code expects easydict
+ self.controller_cfg = edict(self.controller_cfg_dict)
+ else:
+ assert controller_cfg_file is not None
+ self.controller_cfg = YamlConfig(os.path.join(config_root, controller_cfg_file)).as_easydict()
+ self.use_depth_obs = use_depth_obs
+
+ # to enforce control rate
+ self.rate = Rate(control_freq)
+ self.rate_measure = RateMeasure(name="robot", freq_threshold=round(0.95 * control_freq))
+ self.timers = Timers(history=100, disable_on_creation=False)
+
+ camera_names_to_sizes = deepcopy(camera_names_to_sizes)
+ if camera_names_to_sizes is None:
+ self.camera_names_to_sizes = {}
+ else:
+ self.camera_names_to_sizes = camera_names_to_sizes
+ self.center_crop_images = center_crop_images
+
+ self._exclude_depth_from_obs = (not self.use_depth_obs)
+ if self.use_depth_obs and self.postprocess_visual_obs:
+ for cam_name in self.camera_names_to_sizes:
+ depth_mod = "{}_depth".format(cam_name)
+ if not ((depth_mod in ObsUtils.OBS_KEYS_TO_MODALITIES) and ObsUtils.key_is_obs_modality(key=depth_mod, obs_modality="depth")):
+ log_warning("depth observation {} will not be postprocessed since robomimic is not aware of it".format(depth_mod))
+ # # HACK: assume this means we don't actually need depth, but we might need the camera interface to support it for TAMP / perception
+ # self.use_depth_obs = False
+ self._exclude_depth_from_obs = True
+
+ # save kwargs for serialization
+ kwargs = dict(
+ env_name=env_name,
+ camera_names_to_sizes=camera_names_to_sizes,
+ center_crop_images=center_crop_images,
+ general_cfg_file=general_cfg_file,
+ control_freq=control_freq,
+ controller_type=controller_type,
+ controller_cfg_file=controller_cfg_file,
+ controller_cfg_dict=controller_cfg_dict,
+ use_depth_obs=use_depth_obs,
+ state_freq=state_freq,
+ control_timeout=control_timeout,
+ has_gripper=has_gripper,
+ use_visualizer=use_visualizer,
+ debug=debug,
+ )
+ self._init_kwargs = deepcopy(kwargs)
+
+ # connect to robot
+ self.robot_interface = FrankaInterface(
+ general_cfg_file=os.path.join(config_root, general_cfg_file),
+ control_freq=control_freq,
+ state_freq=state_freq,
+ control_timeout=control_timeout,
+ has_gripper=has_gripper,
+ use_visualizer=use_visualizer,
+ debug=debug,
+ )
+
+ # TODO: clean up camera ID definition later
+
+ # start camera interfaces
+ camera_ids = list(range(len(self.camera_names_to_sizes)))
+ self.cr_interfaces = {}
+ for c_id, c_name in enumerate(self.camera_names_to_sizes):
+ cr_interface = CameraRedisSubInterface(camera_id=c_id, use_depth=self.use_depth_obs)
+ cr_interface.start()
+ self.cr_interfaces[c_name] = cr_interface
+
+ # IMPORTANT: initialize JIT functions that may need to compile
+ self._compile_jit_functions()
+
+ def _compile_jit_functions(self):
+ """
+ Helper function to incur the cost of compiling jit functions used by this class upfront.
+
+ NOTE: this function looks strange because we apparently need to make it look like the env.step function
+ for it to compile properly, otherwise we will have a heavy delay on the first env.step call...
+
+ TODO: figure out why this needs to look like the step function code below...
+ """
+
+ # current robot state to use as reference
+ # ee_pos, ee_quat = self.robot_interface.ee_pose
+ ee_mat = U.quat2mat(np.array([0., 0., 0., 1.]))
+ ee_quat_hat = U.mat2quat(ee_mat)
+
+ # convert delta axis-angle to delta rotation matrix, and from there, to absolute target rotation
+ drot = np.array([0., 0., 0.05])
+ angle = np.linalg.norm(drot)
+ if U.isclose(angle, 0.):
+ drot_quat = np.array([0., 0., 0., 1.])
+ else:
+ axis = drot / angle
+ drot_quat = U.axisangle2quat(axis, angle)
+
+ # get target rotation
+ drot_mat = U.quat2mat(drot_quat)
+ target_rot_mat = (drot_mat.T).dot(ee_mat)
+ target_rot_quat = U.mat2quat(target_rot_mat)
+
+ if self.absolute_actions:
+ test_mat = T.quat2mat(T.axisangle2quat(drot))
+
+ def _get_unified_getter(self):
+ """
+ For HITL-TAMP teleoperation only - provides access to important information for perception.
+ """
+ from htamp.scripts.test_real_world import UnifiedGetter
+ return UnifiedGetter(
+ use_real_robot=True,
+ robot_interface=self.robot_interface,
+ camera_interface=self.cr_interfaces["front_image"],
+ )
+
+ def switch_controllers(self, controller_dict):
+ """
+ Switch the controller type and controller config being used. Useful
+ for switching between two different kinds of controllers during an
+ episode - for example, OSC and Joint Impedance.
+
+ Args:
+ controller_dict (dict): dictionary that contains two keys
+ type (str): type of controller
+ cfg (easydict): controller config
+
+ Returns:
+ old_controller_dict (dict): the previous @controller_dict
+ """
+ old_controller_dict = dict(type=self.controller_type, cfg=deepcopy(self.controller_cfg))
+ print("*" * 50)
+ print("SWITCH TO CONTROLLER TYPE: {}".format(controller_dict["type"]))
+ print("*" * 50)
+ self.controller_type = controller_dict["type"]
+ self.controller_cfg = controller_dict["cfg"]
+ return old_controller_dict
+
+ def step(self, action, need_obs=True):
+ """
+ Step in the environment with an action.
+
+ Args:
+ action (np.array): action to take, should be in [-1, 1]
+ need_obs (bool): if False, don't return the observation, because this
+ can involve copying image data around. This allows for more
+ flexibility on when observations are retrieved.
+
+ Returns:
+ observation (dict): new observation dictionary
+ reward (float): reward for this step
+ done (bool): whether the task is done
+ info (dict): extra information
+ """
+ # print("step got action: {}".format(action))
+ if self.controller_type == "OSC_POSE":
+ assert len(action.shape) == 1 and action.shape[0] == 7, "action has incorrect dimensions"
+
+ if self.absolute_actions:
+ # convert action from absolute to relative for compatibility with rest of code
+ action = np.array(action)
+
+ # absolute pose target
+ target_pos = action[:3]
+ target_rot = T.quat2mat(T.axisangle2quat(action[3:6]))
+
+ # current pose
+ last_robot_state = self.robot_interface._state_buffer[-1]
+ ee_pose = np.array(last_robot_state.O_T_EE).reshape((4, 4)).T
+ start_pos = ee_pose[:3, 3]
+ start_rot = ee_pose[:3, :3]
+
+ # TODO: remove hardcode
+ max_dpos = np.array([0.08, 0.08, 0.08])
+ max_drot = np.array([0.5, 0.5, 0.5])
+
+ # copied from MG class (TODO: unify)
+ delta_position = target_pos - start_pos
+ delta_position = np.clip(delta_position / max_dpos, -1., 1.)
+
+ delta_rot_mat = target_rot.dot(start_rot.T)
+ delta_rot_quat = U.mat2quat(delta_rot_mat)
+ delta_rot_aa = U.quat2axisangle(delta_rot_quat)
+ delta_rotation = delta_rot_aa[0] * delta_rot_aa[1]
+ delta_rotation = np.clip(delta_rotation / max_drot, -1., 1.)
+
+ # relative action
+ action[:3] = delta_position
+ action[3:6] = delta_rotation
+ action[6:] = np.clip(action[6:], -1., 1.)
+
+ assert np.min(action) >= -1. and np.max(action) <= 1., "incorrect action bounds"
+ elif self.controller_type == "JOINT_IMPEDANCE":
+ assert len(action.shape) == 1 and action.shape[0] == 8, "action has incorrect dimensions"
+ assert not self.absolute_actions
+ if not np.any(action[:7]):
+ raise Exception("GOT ZERO ACTION WITH JOINT IMPEDANCE CONTROLLER - TERMINATING")
+
+ # compare current joint position with issued action
+ last_robot_state = self.robot_interface._state_buffer[-1]
+ cur_q = np.array(last_robot_state.q)
+
+ # print("joint action: {}".format(action[:7]))
+ # print("current joints: {}".format(cur_q))
+ # print("absolute error: {}".format(np.abs(action[:7] - cur_q)))
+ # print("max absolute error: {}".format(np.max(np.abs(action[:7] - cur_q))))
+
+ # if np.max(np.abs(action[:7] - cur_q)) > 0.2:
+ # raise Exception("max absolute error too high - stopping")
+
+ # TODO: joint impedance controller takes in raw joint positions - we might need to change this later, if we want to learn from these actions
+ # assert np.min(action) >= -1. and np.max(action) <= 1., "incorrect action bounds"
+
+        # measure rate-limiting
+ # self.rate.sleep()
+ self.rate_measure.measure()
+
+ self.timers.tic("real_panda_step")
+
+ self.robot_interface.control(
+ control_type=self.controller_type,
+ action=action,
+ controller_cfg=self.controller_cfg,
+ )
+
+ # remember the last gripper action taken in this variable
+ gripper_command = action[-1:]
+ self.did_grasp = (gripper_command[0] > 0.)
+
+ # get observation
+ obs = None
+ if need_obs:
+ obs = self.get_observation()
+ r = self.get_reward()
+ done = self.is_done()
+
+ self.timers.toc("real_panda_step")
+
+ return obs, r, done, {}
+
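+    # The absolute -> relative conversion in step() above can be summarized by the
+    # following sketch (a hypothetical standalone helper, shown only for clarity;
+    # it mirrors the inline code and assumes the same U.* transform utilities):
+    #
+    #   def _abs_to_rel(target_pos, target_rot, start_pos, start_rot, max_dpos, max_drot):
+    #       dpos = np.clip((target_pos - start_pos) / max_dpos, -1., 1.)
+    #       axis, angle = U.quat2axisangle(U.mat2quat(target_rot.dot(start_rot.T)))
+    #       drot = np.clip((axis * angle) / max_drot, -1., 1.)
+    #       return dpos, drot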
+ def reset(self):
+ """
+ Reset environment.
+
+ Returns:
+ observation (dict): initial observation dictionary.
+ """
+
+ # self.robot_interface.close()
+ # del self.robot_interface
+ # self.robot_interface = FrankaInterface(
+ # general_cfg_file=os.path.join(config_root, self._init_kwargs['general_cfg_file']),
+ # control_freq=self._init_kwargs['control_freq'],
+ # state_freq=self._init_kwargs['state_freq'],
+ # control_timeout=self._init_kwargs['control_timeout'],
+ # has_gripper=self._init_kwargs['has_gripper'],
+ # use_visualizer=self._init_kwargs['use_visualizer'],
+ # debug=self._init_kwargs['debug'],
+ # )
+
+ self.robot_interface.clear_buffer()
+
+ print("restarting the robot interface")
+
+ # Code below based on https://github.com/UT-Austin-RPL/robot_infra/blob/master/gprs/examples/reset_robot_joints.py
+
+ # Golden resetting joints
+ reset_joint_positions = [0.09162008114028396, -0.19826458111314524, -0.01990020486871322, -2.4732269941140346, -0.01307073642274261, 2.30396583422025, 0.8480939705504309]
+
+ # This is for varying initialization of joints a little bit to
+ # increase data variation.
+ # reset_joint_positions = [e + np.clip(np.random.randn() * 0.005, -0.005, 0.005) for e in reset_joint_positions]
+ action = reset_joint_positions + [-1.]
+
+ # temp robot interface to use for joint position control
+ # tmp_robot_interface = FrankaInterface(os.path.join(config_root, self.general_cfg_file), use_visualizer=False)
+ # tmp_controller_cfg = YamlConfig(os.path.join(config_root, self.controller_cfg_file)).as_easydict()
+ tmp_controller_cfg = deepcopy(self.controller_cfg)
+
+ while True:
+ if len(self.robot_interface._state_buffer) > 0:
+ # print(self.robot_interface._state_buffer[-1].q)
+ # print(reset_joint_positions)
+ # print(np.max(np.abs(np.array(self.robot_interface._state_buffer[-1].q) - np.array(reset_joint_positions))))
+ # print("-----------------------")
+
+ # if np.max(np.abs(np.array(self.robot_interface._state_buffer[-1].q) - np.array(reset_joint_positions))) < 1e-3:
+ if np.max(np.abs(np.array(self.robot_interface._state_buffer[-1].q) - np.array(reset_joint_positions))) < 1e-2:
+ break
+
+ self.robot_interface.control(
+ control_type="JOINT_POSITION",
+ action=action,
+ controller_cfg=tmp_controller_cfg,
+ )
+
+ # tmp_robot_interface.close()
+
+ # We added this sleep here to give the C++ controller time to reset from joint control mode to no control mode
+ # to prevent some issues.
+ time.sleep(1.0)
+ print("RESET DONE")
+
+ self.did_grasp = False
+
+ return self.get_observation()
+
+ def reset_to(self, state):
+ """
+        Reset to a specific state. On the real robot, we visualize the reference
+        start image, and a human should manually reset the scene to match it.
+
+        Args:
+            state (dict): initial state that contains:
+                - front_image (np.ndarray): initial workspace image
+
+ Returns:
+ None
+ """
+ assert "front_image" in state
+ ref_img = cv2.cvtColor(state["front_image"], cv2.COLOR_RGB2BGR)
+
+ print("\n" + "*" * 50)
+ print("Reset environment to image shown in left pane")
+ print("Press 'c' when ready to continue.")
+ print("*" * 50 + "\n")
+        while True:
+ # read current image
+ cur_img = self._get_image(camera_name="front_image")
+ if self.use_depth_obs:
+ cur_img = cur_img[0]
+
+ # concatenate frames to display
+ img = np.concatenate([ref_img, cur_img], axis=1)
+
+ # display frame
+ cv2.imshow('initial state alignment window', img)
+ if cv2.waitKey(1) & 0xFF == ord('c'):
+ cv2.destroyAllWindows()
+ break
+
+ def render(self, mode="human", height=None, width=None, camera_name=None, **kwargs):
+ """
+ Render from simulation to either an on-screen window or off-screen to RGB array.
+
+ Args:
+ mode (str): pass "human" for on-screen rendering or "rgb_array" for off-screen rendering
+ height (int): height of image to render - only used if mode is "rgb_array"
+ width (int): width of image to render - only used if mode is "rgb_array"
+ """
+ if mode =="human":
+ raise Exception("on-screen rendering not supported currently")
+ if mode == "rgb_array":
+ # assert (height is None) and (width is None), "cannot resize images"
+ assert camera_name in self.camera_names_to_sizes, "invalid camera name"
+ imgs = self.cr_interfaces[camera_name].get_img()
+ return imgs["color"][..., ::-1]
+ # return self._get_image(camera_name=camera_name)[..., ::-1]
+ else:
+ raise NotImplementedError("mode={} is not implemented".format(mode))
+
+ def get_observation(self, obs=None):
+ """
+ Get current environment observation dictionary.
+
+        Args:
+            obs (dict): raw observation dictionary. Unused here - observations are
+                always queried directly from the robot and camera interfaces.
+        """
+ self.timers.tic("get_observation")
+ observation = {}
+ last_robot_state = self.robot_interface._state_buffer[-1]
+ last_gripper_state = self.robot_interface._gripper_state_buffer[-1]
+ ee_pose = np.array(last_robot_state.O_T_EE).reshape((4, 4)).T
+ if np.count_nonzero(ee_pose.reshape(-1)) == 0:
+ raise Exception("GOT ZERO EE POSE")
+ ee_pos = ee_pose[:3, 3]
+ ee_quat = U.mat2quat(ee_pose[:3, :3])
+ observation["ee_pose"] = np.concatenate([ee_pos, ee_quat])
+ observation["joint_positions"] = np.array(last_robot_state.q)
+ observation["joint_velocities"] = np.array(last_robot_state.dq)
+ observation["gripper_position"] = np.array(last_gripper_state.width)
+ # observation["gripper_velocity"] = self.robot_interface.gripper_velocity
+ for cam_name in self.camera_names_to_sizes:
+ im = self._get_image(camera_name=cam_name)
+ if self.use_depth_obs:
+ im, depth_im = im
+ # im, depth_im, depth_im_unaligned = im
+ # observation[cam_name + "_depth"] = depth_im
+ # observation[cam_name + "_unaligned_depth"] = depth_im_unaligned
+ if (not self._exclude_depth_from_obs):
+ depth_im_mod = cam_name + "_depth"
+ if self.postprocess_visual_obs and (depth_im_mod in ObsUtils.OBS_KEYS_TO_MODALITIES) and ObsUtils.key_is_obs_modality(key=depth_im_mod, obs_modality="depth"):
+ depth_im = ObsUtils.process_obs(obs=depth_im, obs_key=depth_im_mod)
+ observation[depth_im_mod] = depth_im
+ im = im[..., ::-1]
+ if self.postprocess_visual_obs:
+ # NOTE: commented out for now, since run-trained-agent was running into issues with unneeded agent modalities that were present in @self.camera_names_to_sizes
+ # assert (cam_name in ObsUtils.OBS_KEYS_TO_MODALITIES) and ObsUtils.key_is_obs_modality(key=cam_name, obs_modality="rgb")
+ im = ObsUtils.process_obs(obs=im, obs_key=cam_name)
+ observation[cam_name] = im
+ self.timers.toc("get_observation")
+ return observation
+
+ def _get_image(self, camera_name):
+ """
+ Get image from camera interface
+ """
+
+ # get image
+ imgs = self.cr_interfaces[camera_name].get_img()
+ im = imgs["color"]
+
+ # resize image
+ im_size = self.camera_names_to_sizes[camera_name]
+ if im_size is not None:
+ im = Image.fromarray(im).resize((im_size[1], im_size[0]), Image.BILINEAR)
+ im = np.array(im).astype(np.uint8)
+
+ if self.center_crop_images:
+ # center crop image
+ crop_size = min(im.shape[:2])
+ im = center_crop(im, crop_size, crop_size)
+
+ if self.use_depth_obs:
+ depth_im = imgs["depth"]
+ if im_size is not None:
+ # depth_im = Image.fromarray(depth_im).resize((im_size[1], im_size[0]), Image.BILINEAR)
+ depth_im = Image.fromarray(depth_im).resize((im_size[1], im_size[0]))
+ # note: depth images are uint16, with default scale 0.001m
+ depth_im = np.array(depth_im).astype(np.uint16)
+ if len(depth_im.shape) < 3:
+ depth_im = depth_im[..., None] # add channel dimension
+ if self.center_crop_images:
+ depth_im = center_crop(depth_im, crop_size, crop_size)
+ return im, depth_im
+ # depth_images = []
+ # for k in ["depth", "unaligned_depth"]:
+ # depth_im = imgs[k]
+ # if im_size is not None:
+ # # depth_im = Image.fromarray(depth_im).resize((im_size[1], im_size[0]), Image.BILINEAR)
+ # depth_im = Image.fromarray(depth_im).resize((im_size[1], im_size[0]))
+ # # note: depth images are uint16, with default scale 0.001m
+ # depth_im = np.array(depth_im).astype(np.uint16)
+ # if len(depth_im.shape) < 3:
+ # depth_im = depth_im[..., None] # add channel dimension
+ # if self.center_crop_images:
+ # depth_im = center_crop(depth_im, crop_size, crop_size)
+ # depth_images.append(depth_im)
+ # return im, depth_images[0], depth_images[1]
+ return im
+
+ def get_state(self):
+ """
+        Get current environment simulator state as a dictionary. Should be compatible with @reset_to.
+        The real robot has no simulator state, so a placeholder is returned.
+        """
+ return dict(states=np.zeros(1))
+ # raise Exception("Real robot has no simulation state.")
+
+ def get_reward(self):
+ """
+ Get current reward.
+ """
+ return 0.
+
+ def get_goal(self):
+ """
+ Get goal observation. Not all environments support this.
+ """
+ raise NotImplementedError
+
+ def set_goal(self, **kwargs):
+ """
+ Set goal observation with external specification. Not all environments support this.
+ """
+ raise NotImplementedError
+
+ def is_done(self):
+ """
+ Check if the task is done (not necessarily successful).
+ """
+ return False
+
+ def is_success(self):
+ """
+ Check if the task condition(s) is reached. Should return a dictionary
+ { str: bool } with at least a "task" key for the overall task success,
+ and additional optional keys corresponding to other task criteria.
+ """
+
+ # real robot environments don't usually have a success check - this must be done manually
+ return { "task" : False }
+
+ @property
+ def action_dimension(self):
+ """
+ Returns dimension of actions (int).
+ """
+ if self.controller_type == "OSC_POSE":
+ return 7
+ elif self.controller_type == "JOINT_IMPEDANCE":
+ return 8
+ assert False, "should never get here"
+
+ @property
+ def action_dim(self):
+ """
+ Returns dimension of actions (int).
+ """
+ return self.action_dimension
+
+ @property
+ def name(self):
+ """
+        Returns name of environment (str).
+ """
+ # return self._env_name
+
+        # For the real robot, ensure the class name is stored in env meta (as the env name)
+        # for use with any external class registries.
+ return self.__class__.__name__
+
+ @property
+ def type(self):
+ """
+ Returns environment type (int) for this kind of environment.
+ This helps identify this env class.
+ """
+ return EB.EnvType.GPRS_REAL_TYPE
+
+ def serialize(self):
+ """
+ Save all information needed to re-instantiate this environment in a dictionary.
+ This is the same as @env_meta - environment metadata stored in hdf5 datasets,
+ and used in utils/env_utils.py.
+ """
+ return dict(env_name=self.name, type=self.type, env_kwargs=deepcopy(self._init_kwargs))
+
+ @classmethod
+ def create_for_data_processing(
+ cls,
+ env_name,
+ camera_names,
+ camera_height,
+ camera_width,
+ reward_shaping,
+ render=None,
+ render_offscreen=None,
+ use_image_obs=None,
+ use_depth_obs=None,
+ **kwargs,
+ ):
+ """
+ Create environment for processing datasets, which includes extracting
+ observations, labeling dense / sparse rewards, and annotating dones in
+        transitions. For this real-robot environment, the camera arguments are ignored -
+        camera configuration is taken from @camera_names_to_sizes in @kwargs.
+
+ Args:
+            env_name (str): name of environment to create
+
+ Returns:
+ env (EnvRealPanda instance)
+ """
+
+ # initialize obs utils so it knows which modalities are image modalities
+ assert "camera_names_to_sizes" in kwargs
+ image_modalities = list(kwargs["camera_names_to_sizes"].keys())
+ obs_modality_specs = {
+ "obs": {
+ "low_dim": [], # technically unused, so we don't have to specify all of them
+ "image": image_modalities,
+ }
+ }
+ ObsUtils.initialize_obs_utils_with_obs_specs(obs_modality_specs)
+
+ # note that @postprocess_visual_obs is False since this env's images will be written to a dataset
+ return cls(
+ env_name=env_name,
+ render=False,
+ render_offscreen=True,
+ use_image_obs=True,
+ use_depth_obs=use_depth_obs if use_depth_obs is not None else False,
+ postprocess_visual_obs=False,
+ **kwargs,
+ )
+
+ @property
+ def rollout_exceptions(self):
+ """
+ Return tuple of exceptions to except when doing rollouts. This is useful to ensure
+ that the entire training run doesn't crash because of a bad policy that causes unstable
+ simulation computations.
+ """
+ return ()
+
+ @property
+ def base_env(self):
+ """
+ Grabs base simulation environment.
+ """
+ # we don't wrap any env
+ return self
+
+ def __repr__(self):
+ """
+ Pretty-print env description.
+ """
+ return self.name + "\n" + json.dumps(self._init_kwargs, sort_keys=True, indent=4)
+
+ def close(self):
+ """
+ Clean up env
+ """
+ for c_name in self.cr_interfaces:
+ self.cr_interfaces[c_name].stop()
+ self.robot_interface.close()
diff --git a/phantom/submodules/phantom-robomimic/robomimic/envs/env_robosuite.py b/phantom/submodules/phantom-robomimic/robomimic/envs/env_robosuite.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddd958012116e1700a27711f6af39af9dd0a7e29
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/envs/env_robosuite.py
@@ -0,0 +1,537 @@
+"""
+This file contains the robosuite environment wrapper that is used
+to provide a standardized environment API for training policies and interacting
+with metadata present in datasets.
+"""
+import json
+import os
+import numpy as np
+from copy import deepcopy
+
+import robosuite
+import robosuite.utils.transform_utils as T
+try:
+ # this is needed for ensuring robosuite can find the additional mimicgen environments (see https://mimicgen.github.io)
+ import mimicgen_envs
+except ImportError:
+ pass
+
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.envs.env_base as EB
+
+# protect against missing mujoco-py module, since robosuite might be using mujoco-py or DM backend
+try:
+ import mujoco_py
+ MUJOCO_EXCEPTIONS = [mujoco_py.builder.MujocoException]
+except ImportError:
+ MUJOCO_EXCEPTIONS = []
+
+
+class EnvRobosuite(EB.EnvBase):
+ """Wrapper class for robosuite environments (https://github.com/ARISE-Initiative/robosuite)"""
+ def __init__(
+ self,
+ env_name,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=False,
+ use_depth_obs=False,
+ postprocess_visual_obs=True,
+ **kwargs,
+ ):
+ """
+ Args:
+ env_name (str): name of environment. Only needs to be provided if making a different
+ environment from the one in @env_meta.
+
+ render (bool): if True, environment supports on-screen rendering
+
+ render_offscreen (bool): if True, environment supports off-screen rendering. This
+ is forced to be True if @env_meta["use_images"] is True.
+
+ use_image_obs (bool): if True, environment is expected to render rgb image observations
+ on every env.step call. Set this to False for efficiency reasons, if image
+ observations are not required.
+
+ use_depth_obs (bool): if True, environment is expected to render depth image observations
+ on every env.step call. Set this to False for efficiency reasons, if depth
+ observations are not required.
+
+ postprocess_visual_obs (bool): if True, postprocess image observations
+ to prepare for learning. This should only be False when extracting observations
+ for saving to a dataset (to save space on RGB images for example).
+ """
+ self.postprocess_visual_obs = postprocess_visual_obs
+ self.use_depth_obs = use_depth_obs
+
+ # robosuite version check
+ self._is_v1 = (robosuite.__version__.split(".")[0] == "1")
+ if self._is_v1:
+ assert (int(robosuite.__version__.split(".")[1]) >= 2), "only support robosuite v0.3 and v1.2+"
+
+ kwargs = deepcopy(kwargs)
+
+ # update kwargs based on passed arguments
+ update_kwargs = dict(
+ has_renderer=render,
+ has_offscreen_renderer=(render_offscreen or use_image_obs),
+ ignore_done=True,
+ use_object_obs=True,
+ use_camera_obs=use_image_obs,
+ camera_depths=use_depth_obs,
+ )
+ kwargs.update(update_kwargs)
+
+ if self._is_v1:
+ if kwargs["has_offscreen_renderer"]:
+ cuda_visible_device = os.environ.get("CUDA_VISIBLE_DEVICES", "")
+ if cuda_visible_device.isnumeric():
+ # assume that user specified a specific GPU ID
+ kwargs["render_gpu_device_id"] = int(cuda_visible_device)
+ else:
+ # ensure that we select the correct GPU device for rendering by testing for EGL rendering
+ # NOTE: this package should be installed from this link (https://github.com/StanfordVL/egl_probe)
+ import egl_probe
+ valid_gpu_devices = egl_probe.get_available_devices()
+ if len(valid_gpu_devices) > 0:
+ kwargs["render_gpu_device_id"] = valid_gpu_devices[0]
+ else:
+ # make sure gripper visualization is turned off (we almost always want this for learning)
+ kwargs["gripper_visualization"] = False
+ del kwargs["camera_depths"]
+ kwargs["camera_depth"] = use_depth_obs # rename kwarg
+
+ self._env_name = env_name
+ self._init_kwargs = deepcopy(kwargs)
+ self.env = robosuite.make(self._env_name, **kwargs)
+
+ if self._is_v1:
+ # Make sure joint position observations and eef vel observations are active
+ for ob_name in self.env.observation_names:
+ if ("joint_pos" in ob_name) or ("eef_vel" in ob_name):
+ self.env.modify_observable(observable_name=ob_name, attribute="active", modifier=True)
+
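+    # Construction sketch (arguments below are hypothetical examples - in normal
+    # robomimic usage the kwargs come from the dataset's env_meta via
+    # utils/env_utils.py):
+    #
+    #   env = EnvRobosuite(
+    #       "Lift",
+    #       robots="Panda",
+    #       render=False,
+    #       render_offscreen=True,
+    #       use_image_obs=True,
+    #       camera_names=["agentview"],
+    #       camera_heights=84,
+    #       camera_widths=84,
+    #   )
+    #   obs = env.reset()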
+ def step(self, action):
+ """
+ Step in the environment with an action.
+
+ Args:
+ action (np.array): action to take
+
+ Returns:
+ observation (dict): new observation dictionary
+ reward (float): reward for this step
+ done (bool): whether the task is done
+ info (dict): extra information
+ """
+ obs, r, done, info = self.env.step(action)
+ obs = self.get_observation(obs)
+ return obs, r, self.is_done(), info
+
+ def reset(self):
+ """
+ Reset environment.
+
+ Returns:
+ observation (dict): initial observation dictionary.
+ """
+ di = self.env.reset()
+ return self.get_observation(di)
+
+ def reset_to(self, state):
+ """
+ Reset to a specific simulator state.
+
+ Args:
+ state (dict): current simulator state that contains one or more of:
+ - states (np.ndarray): initial state of the mujoco environment
+ - model (str): mujoco scene xml
+
+ Returns:
+ observation (dict): observation dictionary after setting the simulator state (only
+ if "states" is in @state)
+ """
+ should_ret = False
+ if "model" in state:
+ self.reset()
+ robosuite_version_id = int(robosuite.__version__.split(".")[1])
+ if robosuite_version_id <= 3:
+ from robosuite.utils.mjcf_utils import postprocess_model_xml
+ xml = postprocess_model_xml(state["model"])
+ else:
+ # v1.4 and above use the class-based edit_model_xml function
+ xml = self.env.edit_model_xml(state["model"])
+ self.env.reset_from_xml_string(xml)
+ self.env.sim.reset()
+ if not self._is_v1:
+ # hide teleop visualization after restoring from model
+ self.env.sim.model.site_rgba[self.env.eef_site_id] = np.array([0., 0., 0., 0.])
+ self.env.sim.model.site_rgba[self.env.eef_cylinder_id] = np.array([0., 0., 0., 0.])
+ if "states" in state:
+ self.env.sim.set_state_from_flattened(state["states"])
+ self.env.sim.forward()
+ should_ret = True
+
+ if "goal" in state:
+ self.set_goal(**state["goal"])
+ if should_ret:
+ # only return obs if we've done a forward call - otherwise the observations will be garbage
+ return self.get_observation()
+ return None
+
+ def render(self, mode="human", height=None, width=None, camera_name="agentview"):
+ """
+ Render from simulation to either an on-screen window or off-screen to RGB array.
+
+ Args:
+ mode (str): pass "human" for on-screen rendering or "rgb_array" for off-screen rendering
+ height (int): height of image to render - only used if mode is "rgb_array"
+ width (int): width of image to render - only used if mode is "rgb_array"
+ camera_name (str): camera name to use for rendering
+ """
+ if mode == "human":
+ cam_id = self.env.sim.model.camera_name2id(camera_name)
+ self.env.viewer.set_camera(cam_id)
+ return self.env.render()
+ elif mode == "rgb_array":
+ im = self.env.sim.render(height=height, width=width, camera_name=camera_name)
+ if self.use_depth_obs:
+ # render() returns a tuple when self.use_depth_obs=True
+ return im[0][::-1]
+ return im[::-1]
+ else:
+ raise NotImplementedError("mode={} is not implemented".format(mode))
+
+ def get_observation(self, di=None):
+ """
+ Get current environment observation dictionary.
+
+ Args:
+ di (dict): current raw observation dictionary from robosuite to wrap and provide
+ as a dictionary. If not provided, will be queried from robosuite.
+ """
+ if di is None:
+ di = self.env._get_observations(force_update=True) if self._is_v1 else self.env._get_observation()
+ ret = {}
+ for k in di:
+ if (k in ObsUtils.OBS_KEYS_TO_MODALITIES) and ObsUtils.key_is_obs_modality(key=k, obs_modality="rgb"):
+ ret[k] = di[k]
+ if self.postprocess_visual_obs:
+ ret[k] = ObsUtils.process_obs(obs=ret[k], obs_key=k)
+ elif (k in ObsUtils.OBS_KEYS_TO_MODALITIES) and ObsUtils.key_is_obs_modality(key=k, obs_modality="depth"):
+ ret[k] = di[k]
+ if len(ret[k].shape) == 2:
+ ret[k] = ret[k][..., None] # (H, W, 1)
+ assert len(ret[k].shape) == 3
+ # scale entries in depth map to correspond to real distance.
+ ret[k] = self.get_real_depth_map(ret[k])
+ if self.postprocess_visual_obs:
+ ret[k] = ObsUtils.process_obs(obs=ret[k], obs_key=k)
+ elif (k in ObsUtils.OBS_KEYS_TO_MODALITIES) and ObsUtils.key_is_obs_modality(key=k, obs_modality="depth"):
+ ret[k] = di[k]
+ if len(ret[k].shape) == 2:
+ ret[k] = ret[k][..., None] # (H, W, 1)
+ assert len(ret[k].shape) == 3
+ # scale entries in depth map to correspond to real distance.
+ ret[k] = self.get_real_depth_map(ret[k])
+ if self.postprocess_visual_obs:
+ ret[k] = ObsUtils.process_obs(obs=ret[k], obs_key=k)
+ elif k == "frontview_segmentation_instance" or k == "agentview_segmentation_instance":
+ ret[k] = di[k]
+ if len(ret[k].shape) == 2:
+ ret[k] = ret[k][..., None] # (H, W, 1)
+ elif k == "frontview_depth" or "agentview_depth":
+ ret[k] = di[k]
+ if len(ret[k].shape) == 2:
+ ret[k] = ret[k][..., None] # (H, W, 1)
+
+
+ # "object" key contains object information
+ if "object-state" in di.keys():
+ ret["object"] = np.array(di["object-state"])
+
+ if self._is_v1:
+ for robot in self.env.robots:
+ # add all robot-arm-specific observations. Note the (k not in ret) check
+ # ensures that we don't accidentally add robot wrist images a second time
+ pf = robot.robot_model.naming_prefix
+ for k in di:
+ if k.startswith(pf) and (k not in ret) and \
+ (not k.endswith("proprio-state")):
+ ret[k] = np.array(di[k])
+ else:
+ # minimal proprioception for older versions of robosuite
+ ret["proprio"] = np.array(di["robot-state"])
+ ret["eef_pos"] = np.array(di["eef_pos"])
+ ret["eef_quat"] = np.array(di["eef_quat"])
+ ret["gripper_qpos"] = np.array(di["gripper_qpos"])
+ return ret
+
+ def get_real_depth_map(self, depth_map):
+ """
+ Reproduced from https://github.com/ARISE-Initiative/robosuite/blob/c57e282553a4f42378f2635b9a3cbc4afba270fd/robosuite/utils/camera_utils.py#L106
+ since older versions of robosuite do not have this conversion from normalized depth values returned by MuJoCo
+ to real depth values.
+ """
+ # Make sure that depth values are normalized
+ assert np.all(depth_map >= 0.0) and np.all(depth_map <= 1.0)
+ extent = self.env.sim.model.stat.extent
+ far = self.env.sim.model.vis.map.zfar * extent
+ near = self.env.sim.model.vis.map.znear * extent
+ return near / (1.0 - depth_map * (1.0 - near / far))
+
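+    # Worked example for the conversion above (illustrative numbers only): with
+    # near = 0.1 m and far = 5.0 m, a normalized MuJoCo depth value d = 0.5 maps to
+    #
+    #   0.1 / (1.0 - 0.5 * (1.0 - 0.1 / 5.0)) = 0.1 / 0.51 ~= 0.196 m
+    #
+    # i.e. normalized depth is nonlinear in metric distance, which is why this
+    # conversion is applied before depth maps are post-processed or saved.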
+ def get_camera_intrinsic_matrix(self, camera_name, camera_height, camera_width):
+ """
+ Obtains camera intrinsic matrix.
+ Args:
+ camera_name (str): name of camera
+ camera_height (int): height of camera images in pixels
+ camera_width (int): width of camera images in pixels
+ Return:
+ K (np.array): 3x3 camera matrix
+ """
+ cam_id = self.env.sim.model.camera_name2id(camera_name)
+ fovy = self.env.sim.model.cam_fovy[cam_id]
+ f = 0.5 * camera_height / np.tan(fovy * np.pi / 360)
+ K = np.array([[f, 0, camera_width / 2], [0, f, camera_height / 2], [0, 0, 1]])
+ return K
+
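+    # Worked example for the pinhole intrinsics above (illustrative numbers only):
+    # for fovy = 45 degrees and a 240x320 image, f = 0.5 * 240 / tan(22.5 deg)
+    # ~= 289.7 px, giving
+    #
+    #   K = [[289.7,   0.0, 160.0],
+    #        [  0.0, 289.7, 120.0],
+    #        [  0.0,   0.0,   1.0]]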
+ def get_camera_extrinsic_matrix(self, camera_name):
+ """
+        Returns a 4x4 homogeneous matrix corresponding to the camera pose in the
+        world frame. MuJoCo has an unusual convention for the camera body axes,
+        so we apply a correction so that the x and y axes lie in the image plane
+        and the z axis points along the viewing direction.
+ Normal camera convention: https://docs.opencv.org/2.4/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.html
+ Args:
+ camera_name (str): name of camera
+ Return:
+ R (np.array): 4x4 camera extrinsic matrix
+ """
+ cam_id = self.env.sim.model.camera_name2id(camera_name)
+ camera_pos = self.env.sim.data.cam_xpos[cam_id]
+ camera_rot = self.env.sim.data.cam_xmat[cam_id].reshape(3, 3)
+ R = T.make_pose(camera_pos, camera_rot)
+
+ # IMPORTANT! This is a correction so that the camera axis is set up along the viewpoint correctly.
+ camera_axis_correction = np.array(
+ [[1.0, 0.0, 0.0, 0.0], [0.0, -1.0, 0.0, 0.0], [0.0, 0.0, -1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+ )
+ R = R @ camera_axis_correction
+ return R
+
+ def get_camera_transform_matrix(self, camera_name, camera_height, camera_width):
+ """
+ Camera transform matrix to project from world coordinates to pixel coordinates.
+ Args:
+ camera_name (str): name of camera
+ camera_height (int): height of camera images in pixels
+ camera_width (int): width of camera images in pixels
+ Return:
+ K (np.array): 4x4 camera matrix to project from world coordinates to pixel coordinates
+ """
+ R = self.get_camera_extrinsic_matrix(camera_name=camera_name)
+ K = self.get_camera_intrinsic_matrix(
+ camera_name=camera_name, camera_height=camera_height, camera_width=camera_width
+ )
+ K_exp = np.eye(4)
+ K_exp[:3, :3] = K
+
+ # Takes a point in world, transforms to camera frame, and then projects onto image plane.
+ return K_exp @ T.pose_inv(R)
+
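+    # Usage sketch for the transform above (variable names are hypothetical):
+    # project a homogeneous world point [x, y, z, 1] to pixel coordinates.
+    #
+    #   P = env.get_camera_transform_matrix("agentview", 240, 320)
+    #   p_cam = P @ np.array([x, y, z, 1.0])
+    #   u, v = p_cam[0] / p_cam[2], p_cam[1] / p_cam[2]   # divide by depth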
+ def get_state(self):
+ """
+ Get current environment simulator state as a dictionary. Should be compatible with @reset_to.
+ """
+ xml = self.env.sim.model.get_xml() # model xml file
+ state = np.array(self.env.sim.get_state().flatten()) # simulator state
+ return dict(model=xml, states=state)
+
+ def get_reward(self):
+ """
+ Get current reward.
+ """
+ return self.env.reward()
+
+ def get_goal(self):
+ """
+ Get goal observation. Not all environments support this.
+ """
+ return self.get_observation(self.env._get_goal())
+
+ def set_goal(self, **kwargs):
+ """
+ Set goal observation with external specification. Not all environments support this.
+ """
+ return self.env.set_goal(**kwargs)
+
+ def is_done(self):
+ """
+ Check if the task is done (not necessarily successful).
+ """
+
+ # Robosuite envs always rollout to fixed horizon.
+ return False
+
+ def is_success(self):
+ """
+ Check if the task condition(s) is reached. Should return a dictionary
+ { str: bool } with at least a "task" key for the overall task success,
+ and additional optional keys corresponding to other task criteria.
+ """
+ succ = self.env._check_success()
+ if isinstance(succ, dict):
+ assert "task" in succ
+ return succ
+ return { "task" : succ }
+
+ @property
+ def action_dimension(self):
+ """
+ Returns dimension of actions (int).
+ """
+ return self.env.action_spec[0].shape[0]
+
+ @property
+ def name(self):
+ """
+        Returns name of environment (str).
+ """
+ return self._env_name
+
+ @property
+ def type(self):
+ """
+ Returns environment type (int) for this kind of environment.
+ This helps identify this env class.
+ """
+ return EB.EnvType.ROBOSUITE_TYPE
+
+ @property
+ def version(self):
+ """
+        Returns version of robosuite used for this environment, e.g. 1.2.0
+ """
+ return robosuite.__version__
+
+ def serialize(self):
+ """
+ Save all information needed to re-instantiate this environment in a dictionary.
+ This is the same as @env_meta - environment metadata stored in hdf5 datasets,
+ and used in utils/env_utils.py.
+ """
+ return dict(
+ env_name=self.name,
+ env_version=self.version,
+ type=self.type,
+ env_kwargs=deepcopy(self._init_kwargs)
+ )
+
+ @classmethod
+ def create_for_data_processing(
+ cls,
+ env_name,
+ camera_names,
+ camera_height,
+ camera_width,
+ reward_shaping,
+ render=None,
+ render_offscreen=None,
+ use_image_obs=None,
+ use_depth_obs=None,
+ **kwargs,
+ ):
+ """
+ Create environment for processing datasets, which includes extracting
+ observations, labeling dense / sparse rewards, and annotating dones in
+ transitions.
+
+ Args:
+ env_name (str): name of environment
+ camera_names (list of str): list of camera names that correspond to image observations
+ camera_height (int): camera height for all cameras
+ camera_width (int): camera width for all cameras
+ reward_shaping (bool): if True, use shaped environment rewards, else use sparse task completion rewards
+ render (bool or None): optionally override rendering behavior. Defaults to False.
+ render_offscreen (bool or None): optionally override rendering behavior. The default value is True if
+ @camera_names is non-empty, False otherwise.
+ use_image_obs (bool or None): optionally override rendering behavior. The default value is True if
+ @camera_names is non-empty, False otherwise.
+ use_depth_obs (bool): if True, use depth observations
+ """
+ is_v1 = (robosuite.__version__.split(".")[0] == "1")
+ has_camera = (len(camera_names) > 0)
+
+ new_kwargs = {
+ "reward_shaping": reward_shaping,
+ }
+
+ if has_camera:
+ if is_v1:
+ new_kwargs["camera_names"] = list(camera_names)
+ new_kwargs["camera_heights"] = camera_height
+ new_kwargs["camera_widths"] = camera_width
+ else:
+ assert len(camera_names) == 1
+                new_kwargs["camera_name"] = camera_names[0]
+                new_kwargs["camera_height"] = camera_height
+                new_kwargs["camera_width"] = camera_width
+
+ kwargs.update(new_kwargs)
+
+ # also initialize obs utils so it knows which modalities are image modalities
+ image_modalities = list(camera_names)
+ depth_modalities = list(camera_names)
+ if is_v1:
+ image_modalities = ["{}_image".format(cn) for cn in camera_names]
+ depth_modalities = ["{}_depth".format(cn) for cn in camera_names]
+ elif has_camera:
+ # v0.3 only had support for one image, and it was named "image"
+ assert len(image_modalities) == 1
+ image_modalities = ["image"]
+ depth_modalities = ["depth"]
+ obs_modality_specs = {
+ "obs": {
+ "low_dim": [], # technically unused, so we don't have to specify all of them
+ "rgb": image_modalities,
+ }
+ }
+ if use_depth_obs:
+ obs_modality_specs["obs"]["depth"] = depth_modalities
+ ObsUtils.initialize_obs_utils_with_obs_specs(obs_modality_specs)
+
+ # note that @postprocess_visual_obs is False since this env's images will be written to a dataset
+ return cls(
+ env_name=env_name,
+ render=(False if render is None else render),
+ render_offscreen=(has_camera if render_offscreen is None else render_offscreen),
+ use_image_obs=(has_camera if use_image_obs is None else use_image_obs),
+ use_depth_obs=use_depth_obs,
+ postprocess_visual_obs=False,
+ **kwargs,
+ )
+
+ @property
+ def rollout_exceptions(self):
+ """
+ Return tuple of exceptions to except when doing rollouts. This is useful to ensure
+ that the entire training run doesn't crash because of a bad policy that causes unstable
+ simulation computations.
+ """
+ return tuple(MUJOCO_EXCEPTIONS)
+
+ @property
+ def base_env(self):
+ """
+ Grabs base simulation environment.
+ """
+ return self.env
+
+ def __repr__(self):
+ """
+ Pretty-print env description.
+ """
+ return self.name + "\n" + json.dumps(self._init_kwargs, sort_keys=True, indent=4)
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/envs/wrappers.py b/phantom/submodules/phantom-robomimic/robomimic/envs/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb46091ef33279ce9199f9d70e8add72818671a3
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/envs/wrappers.py
@@ -0,0 +1,222 @@
+"""
+A collection of useful environment wrappers.
+"""
+from copy import deepcopy
+import textwrap
+import numpy as np
+from collections import deque
+
+import robomimic.envs.env_base as EB
+
+
+class EnvWrapper(object):
+ """
+ Base class for all environment wrappers in robomimic.
+ """
+ def __init__(self, env):
+ """
+ Args:
+ env (EnvBase instance): The environment to wrap.
+ """
+ assert isinstance(env, EB.EnvBase) or isinstance(env, EnvWrapper)
+ self.env = env
+
+ @classmethod
+ def class_name(cls):
+ return cls.__name__
+
+ def _warn_double_wrap(self):
+ """
+ Utility function that checks if we're accidentally trying to double wrap an env
+ Raises:
+ Exception: [Double wrapping env]
+ """
+ env = self.env
+ while True:
+ if isinstance(env, EnvWrapper):
+ if env.class_name() == self.class_name():
+ raise Exception(
+ "Attempted to double wrap with Wrapper: {}".format(
+ self.__class__.__name__
+ )
+ )
+ env = env.env
+ else:
+ break
+
+ @property
+ def unwrapped(self):
+ """
+ Grabs unwrapped environment
+
+ Returns:
+ env (EnvBase instance): Unwrapped environment
+ """
+ if hasattr(self.env, "unwrapped"):
+ return self.env.unwrapped
+ else:
+ return self.env
+
+ def _to_string(self):
+ """
+ Subclasses should override this method to print out info about the
+ wrapper (such as arguments passed to it).
+ """
+ return ''
+
+ def __repr__(self):
+ """Pretty print environment."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ indent = ' ' * 4
+ if self._to_string() != '':
+ msg += textwrap.indent("\n" + self._to_string(), indent)
+ msg += textwrap.indent("\nenv={}".format(self.env), indent)
+ msg = header + '(' + msg + '\n)'
+ return msg
+
+ # this method is a fallback option on any methods the original env might support
+ def __getattr__(self, attr):
+ # using getattr ensures that both __getattribute__ and __getattr__ (fallback) get called
+ # (see https://stackoverflow.com/questions/3278077/difference-between-getattr-vs-getattribute)
+ orig_attr = getattr(self.env, attr)
+ if callable(orig_attr):
+
+ def hooked(*args, **kwargs):
+ result = orig_attr(*args, **kwargs)
+ # prevent wrapped_class from becoming unwrapped
+ if id(result) == id(self.env):
+ return self
+ return result
+
+ return hooked
+ else:
+ return orig_attr
+
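+# A minimal sketch of how the __getattr__ fallback above behaves (method names
+# below are hypothetical, for illustration only): attributes not defined on the
+# wrapper are forwarded to the wrapped env, and any forwarded method that returns
+# the wrapped env hands back the wrapper instead, so the wrapper chain is never
+# accidentally lost:
+#
+#   wrapped = FrameStackWrapper(env, num_frames=5)
+#   wrapped.is_success()                  # delegates to env.is_success()
+#   wrapped.some_method_returning_self()  # returns the wrapper, not the raw env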
+
+class FrameStackWrapper(EnvWrapper):
+ """
+ Wrapper for frame stacking observations during rollouts. The agent
+ receives a sequence of past observations instead of a single observation
+ when it calls @env.reset, @env.reset_to, or @env.step in the rollout loop.
+ """
+ def __init__(self, env, num_frames):
+ """
+ Args:
+ env (EnvBase instance): The environment to wrap.
+ num_frames (int): number of past observations (including current observation)
+ to stack together. Must be greater than 1 (otherwise this wrapper would
+ be a no-op).
+ """
+ assert num_frames > 1, "error: FrameStackWrapper must have num_frames > 1 but got num_frames of {}".format(num_frames)
+
+ super(FrameStackWrapper, self).__init__(env=env)
+ self.num_frames = num_frames
+
+ ### TODO: add action padding option + adding action to obs to include action history in obs ###
+
+ # keep track of last @num_frames observations for each obs key
+ self.obs_history = None
+
+ def _get_initial_obs_history(self, init_obs):
+ """
+ Helper method to get observation history from the initial observation, by
+ repeating it.
+
+ Returns:
+ obs_history (dict): a deque for each observation key, with an extra
+ leading dimension of 1 for each key (for easy concatenation later)
+ """
+ obs_history = {}
+ for k in init_obs:
+ obs_history[k] = deque(
+ [init_obs[k][None] for _ in range(self.num_frames)],
+ maxlen=self.num_frames,
+ )
+ return obs_history
+
+ def _get_stacked_obs_from_history(self):
+ """
+ Helper method to convert internal variable @self.obs_history to a
+ stacked observation where each key is a numpy array with leading dimension
+ @self.num_frames.
+ """
+ # concatenate all frames per key so we return a numpy array per key
+ return { k : np.concatenate(self.obs_history[k], axis=0) for k in self.obs_history }
+
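+    # Shape sketch for the stacking above (illustrative, assuming num_frames = 3
+    # and a low-dim observation key of shape (D,)):
+    #
+    #   deque entries: three arrays of shape (1, D), leading dim added on append
+    #   stacked key:   np.concatenate(entries, axis=0) -> shape (3, D)
+    #   image keys of shape (H, W, C) stack to (3, H, W, C) in the same way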
+    def cache_obs_history(self):
+        """Save a deep copy of the current observation history."""
+        self.obs_history_cache = deepcopy(self.obs_history)
+
+    def uncache_obs_history(self):
+        """Restore the observation history saved by @cache_obs_history and clear the cache."""
+        self.obs_history = self.obs_history_cache
+        self.obs_history_cache = None
+
+ def reset(self):
+ """
+ Modify to return frame stacked observation which is @self.num_frames copies of
+ the initial observation.
+
+ Returns:
+ obs_stacked (dict): each observation key in original observation now has
+ leading shape @self.num_frames and consists of the previous @self.num_frames
+ observations
+ """
+ obs = self.env.reset()
+ self.timestep = 0 # always zero regardless of timestep type
+ self.update_obs(obs, reset=True)
+ self.obs_history = self._get_initial_obs_history(init_obs=obs)
+ return self._get_stacked_obs_from_history()
+
+ def reset_to(self, state):
+ """
+ Modify to return frame stacked observation which is @self.num_frames copies of
+ the initial observation.
+
+ Returns:
+ obs_stacked (dict): each observation key in original observation now has
+ leading shape @self.num_frames and consists of the previous @self.num_frames
+ observations
+ """
+ obs = self.env.reset_to(state)
+ self.timestep = 0 # always zero regardless of timestep type
+ self.update_obs(obs, reset=True)
+ self.obs_history = self._get_initial_obs_history(init_obs=obs)
+ return self._get_stacked_obs_from_history()
+
+ def step(self, action):
+ """
+ Modify to update the internal frame history and return frame stacked observation,
+ which will have leading dimension @self.num_frames for each key.
+
+ Args:
+ action (np.array): action to take
+
+ Returns:
+ obs_stacked (dict): each observation key in original observation now has
+ leading shape @self.num_frames and consists of the previous @self.num_frames
+ observations
+ reward (float): reward for this step
+ done (bool): whether the task is done
+ info (dict): extra information
+ """
+ obs, r, done, info = self.env.step(action)
+ self.update_obs(obs, action=action, reset=False)
+ # update frame history
+ for k in obs:
+ # make sure to have leading dim of 1 for easy concatenation
+ self.obs_history[k].append(obs[k][None])
+ obs_ret = self._get_stacked_obs_from_history()
+ return obs_ret, r, done, info
+
+    def update_obs(self, obs, action=None, reset=False):
+        """Augment @obs in-place with the current timestep and the action that produced it."""
+        obs["timesteps"] = np.array([self.timestep])
+
+ if reset:
+ obs["actions"] = np.zeros(self.env.action_dimension)
+ else:
+ self.timestep += 1
+ obs["actions"] = action[: self.env.action_dimension]
+
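+    # Usage sketch for this wrapper (the base_env variable is hypothetical):
+    #
+    #   env = FrameStackWrapper(base_env, num_frames=10)
+    #   stacked_obs = env.reset()                      # every key has leading dim 10
+    #   stacked_obs, r, done, info = env.step(action)  # history shifts by one frame
+    #
+    # The stacked observations also carry the "timesteps" and "actions" entries
+    # added by update_obs() below.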
+ def _to_string(self):
+ """Info to pretty print."""
+ return "num_frames={}".format(self.num_frames)
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bc.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bc.json
new file mode 100644
index 0000000000000000000000000000000000000000..82ad783fbf330fecf0d59f97e346dc797dbaba1f
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bc.json
@@ -0,0 +1,215 @@
+{
+ "algo_name": "bc",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../bc_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": false,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "hdf5_validation_filter_key": null,
+ "seq_length": 1,
+ "pad_seq_length": true,
+ "frame_stack": 1,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions",
+ "rewards",
+ "dones"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 100,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "policy": {
+ "optimizer_type": "adam",
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": [],
+ "scheduler_type": "multistep"
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "loss": {
+ "l2_weight": 1.0,
+ "l1_weight": 0.0,
+ "cos_weight": 0.0
+ },
+ "actor_layer_dims": [
+ 1024,
+ 1024
+ ],
+ "gaussian": {
+ "enabled": false,
+ "fixed_std": false,
+ "init_std": 0.1,
+ "min_std": 0.01,
+ "std_activation": "softplus",
+ "low_noise_eval": true
+ },
+ "gmm": {
+ "enabled": false,
+ "num_modes": 5,
+ "min_std": 0.0001,
+ "std_activation": "softplus",
+ "low_noise_eval": true
+ },
+ "vae": {
+ "enabled": false,
+ "latent_dim": 14,
+ "latent_clip": null,
+ "kl_weight": 1.0,
+ "decoder": {
+ "is_conditioned": true,
+ "reconstruction_sum_across_elements": false
+ },
+ "prior": {
+ "learn": false,
+ "is_conditioned": false,
+ "use_gmm": false,
+ "gmm_num_modes": 10,
+ "gmm_learn_weights": false,
+ "use_categorical": false,
+ "categorical_dim": 10,
+ "categorical_gumbel_softmax_hard": false,
+ "categorical_init_temp": 1.0,
+ "categorical_temp_anneal_step": 0.001,
+ "categorical_min_temp": 0.3
+ },
+ "encoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "decoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "prior_layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "rnn": {
+ "enabled": false,
+ "horizon": 10,
+ "hidden_dim": 400,
+ "rnn_type": "LSTM",
+ "num_layers": 2,
+ "open_loop": false,
+ "kwargs": {
+ "bidirectional": false
+ }
+ },
+ "transformer": {
+ "enabled": false,
+ "context_length": 10,
+ "embed_dim": 512,
+ "num_layers": 6,
+ "num_heads": 8,
+ "emb_dropout": 0.1,
+ "attn_dropout": 0.1,
+ "block_output_dropout": 0.1,
+ "sinusoidal_embedding": false,
+ "activation": "gelu",
+ "supervise_all_steps": false,
+ "nn_parameter_for_timesteps": true
+ }
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ },
+ "meta": {
+ "hp_base_config_file": null,
+ "hp_keys": [],
+ "hp_values": []
+ }
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bc_transformer.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bc_transformer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c28696cb0d6abc2d081570ed4dc2eaf16939a819
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bc_transformer.json
@@ -0,0 +1,171 @@
+{
+ "algo_name": "bc",
+ "experiment": {
+ "name": "test",
+ "validate": true,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../bc_transformer_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "low_dim",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": false,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "seq_length": 1,
+ "pad_seq_length": true,
+ "frame_stack": 10,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 100,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "policy": {
+ "optimizer_type": "adamw",
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": [100],
+ "scheduler_type": "linear"
+ },
+ "regularization": {
+ "L2": 0.01
+ }
+ }
+ },
+ "loss": {
+ "l2_weight": 1.0,
+ "l1_weight": 0.0,
+ "cos_weight": 0.0
+ },
+ "actor_layer_dims": [],
+ "gaussian": {
+ "enabled": false
+ },
+ "gmm": {
+ "enabled": true,
+ "num_modes": 5,
+ "min_std": 0.0001,
+ "std_activation": "softplus",
+ "low_noise_eval": true
+ },
+ "vae": {
+ "enabled": false
+ },
+ "rnn": {
+ "enabled": false
+ },
+ "transformer": {
+ "enabled": true,
+ "supervise_all_steps": false,
+ "num_layers": 6,
+ "embed_dim": 512,
+ "num_heads": 8
+ }
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {
+ "feature_dimension": 64,
+ "backbone_class": "ResNet18Conv",
+ "backbone_kwargs": {
+ "pretrained": false,
+ "input_coord_conv": false
+ },
+ "pool_class": "SpatialSoftmax",
+ "pool_kwargs": {
+ "num_kp": 32,
+ "learnable_temperature": false,
+ "temperature": 1.0,
+ "noise_std": 0.0
+ }
+ },
+ "obs_randomizer_class": "CropRandomizer",
+ "obs_randomizer_kwargs": {
+ "crop_height": 76,
+ "crop_width": 76,
+ "num_crops": 1,
+ "pos_enc": false
+ }
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bcq.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bcq.json
new file mode 100644
index 0000000000000000000000000000000000000000..5ae9d907466f4278b418bcc1fb93aacb7fcb1e2a
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/bcq.json
@@ -0,0 +1,235 @@
+{
+ "algo_name": "bcq",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../bcq_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": true,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "hdf5_validation_filter_key": null,
+ "seq_length": 1,
+ "pad_seq_length": true,
+ "frame_stack": 1,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions",
+ "rewards",
+ "dones"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 100,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "critic": {
+ "learning_rate": {
+ "initial": 0.001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ },
+ "start_epoch": -1,
+ "end_epoch": -1
+ },
+ "action_sampler": {
+ "learning_rate": {
+ "initial": 0.001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ },
+ "start_epoch": -1,
+ "end_epoch": -1
+ },
+ "actor": {
+ "learning_rate": {
+ "initial": 0.001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ },
+ "start_epoch": -1,
+ "end_epoch": -1
+ }
+ },
+ "discount": 0.99,
+ "n_step": 1,
+ "target_tau": 0.005,
+ "infinite_horizon": false,
+ "critic": {
+ "use_huber": false,
+ "max_gradient_norm": null,
+ "value_bounds": null,
+ "num_action_samples": 10,
+ "num_action_samples_rollout": 100,
+ "ensemble": {
+ "n": 2,
+ "weight": 0.75
+ },
+ "distributional": {
+ "enabled": false,
+ "num_atoms": 51
+ },
+ "layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "action_sampler": {
+ "actor_layer_dims": [
+ 1024,
+ 1024
+ ],
+ "gmm": {
+ "enabled": false,
+ "num_modes": 5,
+ "min_std": 0.0001,
+ "std_activation": "softplus",
+ "low_noise_eval": true
+ },
+ "vae": {
+ "enabled": true,
+ "latent_dim": 14,
+ "latent_clip": null,
+ "kl_weight": 1.0,
+ "decoder": {
+ "is_conditioned": true,
+ "reconstruction_sum_across_elements": false
+ },
+ "prior": {
+ "learn": false,
+ "is_conditioned": false,
+ "use_gmm": false,
+ "gmm_num_modes": 10,
+ "gmm_learn_weights": false,
+ "use_categorical": false,
+ "categorical_dim": 10,
+ "categorical_gumbel_softmax_hard": false,
+ "categorical_init_temp": 1.0,
+ "categorical_temp_anneal_step": 0.001,
+ "categorical_min_temp": 0.3
+ },
+ "encoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "decoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "prior_layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "freeze_encoder_epoch": -1
+ },
+ "actor": {
+ "enabled": false,
+ "perturbation_scale": 0.05,
+ "layer_dims": [
+ 300,
+ 400
+ ]
+ }
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ },
+ "meta": {
+ "hp_base_config_file": null,
+ "hp_keys": [],
+ "hp_values": []
+ }
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/cql.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/cql.json
new file mode 100644
index 0000000000000000000000000000000000000000..a920efd6f01844971fba4881d73b762f7cf47ade
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/cql.json
@@ -0,0 +1,182 @@
+{
+ "algo_name": "cql",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../cql_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": true,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "hdf5_validation_filter_key": null,
+ "seq_length": 1,
+ "pad_seq_length": true,
+ "frame_stack": 1,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions",
+ "rewards",
+ "dones"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 1024,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "critic": {
+ "learning_rate": {
+ "initial": 0.001,
+ "decay_factor": 0.0,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ },
+ "actor": {
+ "learning_rate": {
+ "initial": 0.0003,
+ "decay_factor": 0.0,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "discount": 0.99,
+ "n_step": 1,
+ "target_tau": 0.005,
+ "actor": {
+ "bc_start_steps": 0,
+ "target_entropy": "default",
+ "max_gradient_norm": null,
+ "net": {
+ "type": "gaussian",
+ "common": {
+ "std_activation": "exp",
+ "use_tanh": true,
+ "low_noise_eval": true
+ },
+ "gaussian": {
+ "init_last_fc_weight": 0.001,
+ "init_std": 0.3,
+ "fixed_std": false
+ }
+ },
+ "layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "critic": {
+ "use_huber": false,
+ "max_gradient_norm": null,
+ "value_bounds": null,
+ "num_action_samples": 1,
+ "cql_weight": 1.0,
+ "deterministic_backup": true,
+ "min_q_weight": 1.0,
+ "target_q_gap": 5.0,
+ "num_random_actions": 10,
+ "ensemble": {
+ "n": 2
+ },
+ "layer_dims": [
+ 300,
+ 400
+ ]
+ }
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ },
+ "meta": {
+ "hp_base_config_file": null,
+ "hp_keys": [],
+ "hp_values": []
+ }
+}
\ No newline at end of file
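In the CQL template above, `min_q_weight`, `cql_weight`, and `num_random_actions` govern the conservative regularizer that separates CQL from a vanilla actor-critic. A minimal sketch of that term, following the standard CQL formulation rather than robomimic's exact implementation:

```python
# Conservative Q penalty sketch: push Q down on sampled (out-of-distribution)
# actions and up on dataset actions, scaled by min_q_weight from the template.
import torch

def conservative_penalty(q_rand: torch.Tensor, q_data: torch.Tensor, min_q_weight: float = 1.0) -> torch.Tensor:
    """q_rand: [B, N] Q-values at N sampled actions; q_data: [B] Q-values at dataset actions."""
    return min_q_weight * (torch.logsumexp(q_rand, dim=1) - q_data).mean()
```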
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/diffusion_policy.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/diffusion_policy.json
new file mode 100644
index 0000000000000000000000000000000000000000..75936bb53d5155bac7730c741b20aec7d554ac73
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/diffusion_policy.json
@@ -0,0 +1,174 @@
+{
+ "algo_name": "diffusion_policy",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir":"../diffusion_policy_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "low_dim",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": false,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "seq_length": 15,
+ "pad_seq_length": true,
+ "frame_stack": 2,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 256,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "policy": {
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "horizon": {
+ "observation_horizon": 2,
+ "action_horizon": 8,
+ "prediction_horizon": 16
+ },
+ "unet": {
+ "enabled": true,
+ "diffusion_step_embed_dim": 256,
+ "down_dims": [256,512,1024],
+ "kernel_size": 5,
+ "n_groups": 8
+ },
+ "ema": {
+ "enabled": true,
+ "power": 0.75
+ },
+ "ddpm": {
+ "enabled": true,
+ "num_train_timesteps": 100,
+ "num_inference_timesteps": 100,
+ "beta_schedule": "squaredcos_cap_v2",
+ "clip_sample": true,
+ "prediction_type": "epsilon"
+ },
+ "ddim": {
+ "enabled": false,
+ "num_train_timesteps": 100,
+ "num_inference_timesteps": 10,
+ "beta_schedule": "squaredcos_cap_v2",
+ "clip_sample": true,
+ "set_alpha_to_one": true,
+ "steps_offset": 0,
+ "prediction_type": "epsilon"
+ }
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {
+ "feature_dimension": 64,
+ "backbone_class": "ResNet18Conv",
+ "backbone_kwargs": {
+ "pretrained": false,
+ "input_coord_conv": false
+ },
+ "pool_class": "SpatialSoftmax",
+ "pool_kwargs": {
+ "num_kp": 32,
+ "learnable_temperature": false,
+ "temperature": 1.0,
+ "noise_std": 0.0
+ }
+ },
+ "obs_randomizer_class": "CropRandomizer",
+ "obs_randomizer_kwargs": {
+ "crop_height": 76,
+ "crop_width": 76,
+ "num_crops": 1,
+ "pos_enc": false
+ }
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ }
+}
\ No newline at end of file
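In the diffusion policy template above, `algo.horizon` sets three interacting windows: the policy conditions on the last `observation_horizon` (2) observations, predicts `prediction_horizon` (16) actions, and executes only the first `action_horizon` (8) before re-planning. A hedged sketch of that receding-horizon loop; `policy.predict` and `env.step` are placeholder interfaces, not robomimic APIs:

```python
# Receding-horizon rollout sketch for To=2, Ta=8, Tp=16 (values from the template above).
from collections import deque

def rollout(env, policy, first_obs, To=2, Ta=8, max_steps=400):
    history = deque([first_obs] * To, maxlen=To)   # pad observation history at episode start
    obs, steps = first_obs, 0
    while steps < max_steps:
        actions = policy.predict(list(history))    # assumed to return Tp predicted actions
        for action in actions[:Ta]:                # execute only the first Ta of them
            obs, _, done, _ = env.step(action)
            history.append(obs)
            steps += 1
            if done or steps >= max_steps:
                return
```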
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/gl.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/gl.json
new file mode 100644
index 0000000000000000000000000000000000000000..39b4c2dbd65dad06afaaa1f88bd605a3477e3312
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/gl.json
@@ -0,0 +1,182 @@
+{
+ "algo_name": "gl",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../gl_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": true,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "hdf5_validation_filter_key": null,
+ "seq_length": 1,
+ "pad_seq_length": true,
+ "frame_stack": 1,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions",
+ "rewards",
+ "dones"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 100,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "goal_network": {
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "subgoal_horizon": 10,
+ "ae": {
+ "planner_layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "vae": {
+ "enabled": true,
+ "latent_dim": 16,
+ "latent_clip": null,
+ "kl_weight": 1.0,
+ "decoder": {
+ "is_conditioned": true,
+ "reconstruction_sum_across_elements": false
+ },
+ "prior": {
+ "learn": false,
+ "is_conditioned": false,
+ "use_gmm": false,
+ "gmm_num_modes": 10,
+ "gmm_learn_weights": false,
+ "use_categorical": false,
+ "categorical_dim": 10,
+ "categorical_gumbel_softmax_hard": false,
+ "categorical_init_temp": 1.0,
+ "categorical_temp_anneal_step": 0.001,
+ "categorical_min_temp": 0.3
+ },
+ "encoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "decoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "prior_layer_dims": [
+ 300,
+ 400
+ ]
+ }
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "subgoal": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ },
+ "meta": {
+ "hp_base_config_file": null,
+ "hp_keys": [],
+ "hp_values": []
+ }
+}
\ No newline at end of file
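In the goal-learning template above, `vae.kl_weight` balances reconstruction accuracy against the KL term of the subgoal VAE. A generic beta-VAE-style sketch of that trade-off (not robomimic's exact loss code):

```python
# kl_weight plays the role of beta in a standard VAE objective.
import torch
import torch.nn.functional as F

def goal_vae_loss(recon, target, mu, logvar, kl_weight=1.0):
    recon_loss = F.mse_loss(recon, target)
    kl = -0.5 * torch.mean(1.0 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + kl_weight * kl
```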
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/hbc.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/hbc.json
new file mode 100644
index 0000000000000000000000000000000000000000..26eff76a8f40e3fd787c7a561a91155369101b7e
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/hbc.json
@@ -0,0 +1,293 @@
+{
+ "algo_name": "hbc",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../hbc_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": true,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "hdf5_validation_filter_key": null,
+ "seq_length": 10,
+ "pad_seq_length": true,
+ "frame_stack": 1,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions",
+ "rewards",
+ "dones"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 100,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "mode": "separate",
+ "actor_use_random_subgoals": false,
+ "subgoal_update_interval": 10,
+ "latent_subgoal": {
+ "enabled": false,
+ "prior_correction": {
+ "enabled": false,
+ "num_samples": 100
+ }
+ },
+ "planner": {
+ "optim_params": {
+ "goal_network": {
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "subgoal_horizon": 10,
+ "ae": {
+ "planner_layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "vae": {
+ "enabled": true,
+ "latent_dim": 16,
+ "latent_clip": null,
+ "kl_weight": 1.0,
+ "decoder": {
+ "is_conditioned": true,
+ "reconstruction_sum_across_elements": false
+ },
+ "prior": {
+ "learn": false,
+ "is_conditioned": false,
+ "use_gmm": false,
+ "gmm_num_modes": 10,
+ "gmm_learn_weights": false,
+ "use_categorical": false,
+ "categorical_dim": 10,
+ "categorical_gumbel_softmax_hard": false,
+ "categorical_init_temp": 1.0,
+ "categorical_temp_anneal_step": 0.001,
+ "categorical_min_temp": 0.3
+ },
+ "encoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "decoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "prior_layer_dims": [
+ 300,
+ 400
+ ]
+ }
+ },
+ "actor": {
+ "optim_params": {
+ "policy": {
+ "optimizer_type": "adam",
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": [],
+ "scheduler_type": "multistep"
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "loss": {
+ "l2_weight": 1.0,
+ "l1_weight": 0.0,
+ "cos_weight": 0.0
+ },
+ "actor_layer_dims": [
+ 1024,
+ 1024
+ ],
+ "rnn": {
+ "enabled": true,
+ "horizon": 10,
+ "hidden_dim": 400,
+ "rnn_type": "LSTM",
+ "num_layers": 2,
+ "open_loop": false,
+ "kwargs": {
+ "bidirectional": false
+ }
+ },
+ "transformer": {
+ "enabled": false,
+ "context_length": 10,
+ "embed_dim": 512,
+ "num_layers": 6,
+ "num_heads": 8,
+ "emb_dropout": 0.1,
+ "attn_dropout": 0.1,
+ "block_output_dropout": 0.1,
+ "sinusoidal_embedding": false,
+ "activation": "gelu",
+ "supervise_all_steps": false,
+ "nn_parameter_for_timesteps": true
+ }
+ }
+ },
+ "observation": {
+ "planner": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "subgoal": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ },
+ "actor": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ }
+ },
+ "meta": {
+ "hp_base_config_file": null,
+ "hp_keys": [],
+ "hp_values": []
+ }
+}
\ No newline at end of file
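In the HBC template above, `subgoal_update_interval` (10) is the cadence at which the planner refreshes the subgoal that the low-level actor conditions on. A rough sketch of the hierarchical rollout this implies; `planner` and `actor` are placeholder callables, not robomimic objects:

```python
# Hierarchical rollout sketch: re-plan a subgoal every K steps, act toward it in between.
def hbc_rollout(env, planner, actor, obs, K=10, max_steps=400):
    subgoal = planner(obs)
    for t in range(max_steps):
        if t % K == 0:
            subgoal = planner(obs)               # refresh the subgoal on the configured interval
        obs, _, done, _ = env.step(actor(obs, subgoal))
        if done:
            break
```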
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/iql.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/iql.json
new file mode 100644
index 0000000000000000000000000000000000000000..4731788417924c649f1b92627fe6bf7f14668aac
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/iql.json
@@ -0,0 +1,192 @@
+{
+ "algo_name": "iql",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../iql_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": true,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "hdf5_validation_filter_key": null,
+ "seq_length": 1,
+ "pad_seq_length": true,
+ "frame_stack": 1,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions",
+ "rewards",
+ "dones"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 100,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "critic": {
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.0,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ },
+ "vf": {
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.0,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ },
+ "actor": {
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.0,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "discount": 0.99,
+ "target_tau": 0.01,
+ "actor": {
+ "net": {
+ "type": "gaussian",
+ "common": {
+ "std_activation": "softplus",
+ "low_noise_eval": true,
+ "use_tanh": false
+ },
+ "gaussian": {
+ "init_last_fc_weight": 0.001,
+ "init_std": 0.3,
+ "fixed_std": false
+ },
+ "gmm": {
+ "num_modes": 5,
+ "min_std": 0.0001
+ }
+ },
+ "layer_dims": [
+ 300,
+ 400
+ ],
+ "max_gradient_norm": null
+ },
+ "critic": {
+ "ensemble": {
+ "n": 2
+ },
+ "layer_dims": [
+ 300,
+ 400
+ ],
+ "use_huber": false,
+ "max_gradient_norm": null
+ },
+ "adv": {
+ "clip_adv_value": null,
+ "beta": 1.0,
+ "use_final_clip": true
+ },
+ "vf_quantile": 0.9
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ },
+ "meta": {
+ "hp_base_config_file": null,
+ "hp_keys": [],
+ "hp_values": []
+ }
+}
\ No newline at end of file
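In the IQL template above, `vf_quantile` sets the expectile used to fit the value network, and `adv.beta` (together with the clipping options) shapes the advantage weights used in the actor update. A sketch following the IQL paper's formulation, which may differ in convention from robomimic's internals:

```python
# Expectile value regression and exponential advantage weighting, per the IQL paper.
import torch

def expectile_loss(value, target_q, quantile=0.9):
    diff = target_q - value
    weight = torch.abs(quantile - (diff < 0).float())   # tau when under-estimating, 1 - tau otherwise
    return (weight * diff.pow(2)).mean()

def advantage_weight(q, value, beta=1.0, clip_max=100.0):
    # exp(beta * A) with an upper clip to keep the actor update stable
    return torch.clamp(torch.exp(beta * (q - value)), max=clip_max)
```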
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/iris.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/iris.json
new file mode 100644
index 0000000000000000000000000000000000000000..6551663864a4d57d05d263de0069269ab115d8de
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/iris.json
@@ -0,0 +1,465 @@
+{
+ "algo_name": "iris",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 50,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": false,
+ "on_best_rollout_success_rate": true
+ },
+ "epoch_every_n_steps": 100,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": true,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 400,
+ "rate": 50,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../iris_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": true,
+ "hdf5_normalize_obs": false,
+ "hdf5_filter_key": null,
+ "hdf5_validation_filter_key": null,
+ "seq_length": 10,
+ "pad_seq_length": true,
+ "frame_stack": 1,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions",
+ "rewards",
+ "dones"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 100,
+ "num_epochs": 2000,
+ "seed": 1
+ },
+ "algo": {
+ "mode": "separate",
+ "actor_use_random_subgoals": false,
+ "subgoal_update_interval": 10,
+ "latent_subgoal": {
+ "enabled": false,
+ "prior_correction": {
+ "enabled": false,
+ "num_samples": 100
+ }
+ },
+ "value_planner": {
+ "planner": {
+ "optim_params": {
+ "goal_network": {
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "subgoal_horizon": 10,
+ "ae": {
+ "planner_layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "vae": {
+ "enabled": true,
+ "latent_dim": 16,
+ "latent_clip": null,
+ "kl_weight": 1.0,
+ "decoder": {
+ "is_conditioned": true,
+ "reconstruction_sum_across_elements": false
+ },
+ "prior": {
+ "learn": false,
+ "is_conditioned": false,
+ "use_gmm": false,
+ "gmm_num_modes": 10,
+ "gmm_learn_weights": false,
+ "use_categorical": false,
+ "categorical_dim": 10,
+ "categorical_gumbel_softmax_hard": false,
+ "categorical_init_temp": 1.0,
+ "categorical_temp_anneal_step": 0.001,
+ "categorical_min_temp": 0.3
+ },
+ "encoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "decoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "prior_layer_dims": [
+ 300,
+ 400
+ ]
+ }
+ },
+ "value": {
+ "optim_params": {
+ "critic": {
+ "learning_rate": {
+ "initial": 0.001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ },
+ "start_epoch": -1,
+ "end_epoch": -1
+ },
+ "action_sampler": {
+ "learning_rate": {
+ "initial": 0.001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ },
+ "start_epoch": -1,
+ "end_epoch": -1
+ },
+ "actor": {
+ "learning_rate": {
+ "initial": 0.001,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ },
+ "start_epoch": -1,
+ "end_epoch": -1
+ }
+ },
+ "discount": 0.99,
+ "n_step": 1,
+ "target_tau": 0.005,
+ "infinite_horizon": false,
+ "critic": {
+ "use_huber": false,
+ "max_gradient_norm": null,
+ "value_bounds": null,
+ "num_action_samples": 10,
+ "num_action_samples_rollout": 100,
+ "ensemble": {
+ "n": 2,
+ "weight": 0.75
+ },
+ "distributional": {
+ "enabled": false,
+ "num_atoms": 51
+ },
+ "layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "action_sampler": {
+ "actor_layer_dims": [
+ 1024,
+ 1024
+ ],
+ "gmm": {
+ "enabled": false,
+ "num_modes": 5,
+ "min_std": 0.0001,
+ "std_activation": "softplus",
+ "low_noise_eval": true
+ },
+ "vae": {
+ "enabled": true,
+ "latent_dim": 14,
+ "latent_clip": null,
+ "kl_weight": 1.0,
+ "decoder": {
+ "is_conditioned": true,
+ "reconstruction_sum_across_elements": false
+ },
+ "prior": {
+ "learn": false,
+ "is_conditioned": false,
+ "use_gmm": false,
+ "gmm_num_modes": 10,
+ "gmm_learn_weights": false,
+ "use_categorical": false,
+ "categorical_dim": 10,
+ "categorical_gumbel_softmax_hard": false,
+ "categorical_init_temp": 1.0,
+ "categorical_temp_anneal_step": 0.001,
+ "categorical_min_temp": 0.3
+ },
+ "encoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "decoder_layer_dims": [
+ 300,
+ 400
+ ],
+ "prior_layer_dims": [
+ 300,
+ 400
+ ]
+ },
+ "freeze_encoder_epoch": -1
+ },
+ "actor": {
+ "enabled": false,
+ "perturbation_scale": 0.05,
+ "layer_dims": [
+ 300,
+ 400
+ ]
+ }
+ },
+ "num_samples": 100
+ },
+ "actor": {
+ "optim_params": {
+ "policy": {
+ "optimizer_type": "adam",
+ "learning_rate": {
+ "initial": 0.0001,
+ "decay_factor": 0.1,
+ "epoch_schedule": [],
+ "scheduler_type": "multistep"
+ },
+ "regularization": {
+ "L2": 0.0
+ }
+ }
+ },
+ "loss": {
+ "l2_weight": 1.0,
+ "l1_weight": 0.0,
+ "cos_weight": 0.0
+ },
+ "actor_layer_dims": [
+ 1024,
+ 1024
+ ],
+ "rnn": {
+ "enabled": true,
+ "horizon": 10,
+ "hidden_dim": 400,
+ "rnn_type": "LSTM",
+ "num_layers": 2,
+ "open_loop": false,
+ "kwargs": {
+ "bidirectional": false
+ }
+ },
+ "transformer": {
+ "enabled": false,
+ "context_length": 10,
+ "embed_dim": 512,
+ "num_layers": 6,
+ "num_heads": 8,
+ "emb_dropout": 0.1,
+ "attn_dropout": 0.1,
+ "block_output_dropout": 0.1,
+ "sinusoidal_embedding": false,
+ "activation": "gelu",
+ "supervise_all_steps": false,
+ "nn_parameter_for_timesteps": true
+ }
+ }
+ },
+ "observation": {
+ "value_planner": {
+ "planner": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "subgoal": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ },
+ "value": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ }
+ },
+ "actor": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ }
+ },
+ "meta": {
+ "hp_base_config_file": null,
+ "hp_keys": [],
+ "hp_values": []
+ }
+}
\ No newline at end of file
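The IRIS template above pairs the goal-VAE planner with a value network; `value_planner.num_samples` (100) is the number of candidate subgoals scored per re-plan. A hedged sketch of that value-guided selection step; `sample_subgoal` and `value_fn` are placeholder interfaces:

```python
# Value-guided subgoal selection sketch: sample candidates from the planner,
# keep the one the value network ranks highest.
import torch

def select_subgoal(sample_subgoal, value_fn, obs, num_samples=100):
    candidates = [sample_subgoal(obs) for _ in range(num_samples)]
    values = torch.stack([value_fn(obs, g) for g in candidates])
    return candidates[int(torch.argmax(values))]
```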
diff --git a/phantom/submodules/phantom-robomimic/robomimic/exps/templates/td3_bc.json b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/td3_bc.json
new file mode 100644
index 0000000000000000000000000000000000000000..414a8f04f0cce7c9857207b1b1269ff10c3ee38b
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/exps/templates/td3_bc.json
@@ -0,0 +1,167 @@
+{
+ "algo_name": "td3_bc",
+ "experiment": {
+ "name": "test",
+ "validate": false,
+ "logging": {
+ "terminal_output_to_txt": true,
+ "log_tb": true,
+ "log_wandb": false,
+ "wandb_proj_name": "debug"
+ },
+ "save": {
+ "enabled": true,
+ "every_n_seconds": null,
+ "every_n_epochs": 20,
+ "epochs": [],
+ "on_best_validation": false,
+ "on_best_rollout_return": true,
+ "on_best_rollout_success_rate": false
+ },
+ "epoch_every_n_steps": 5000,
+ "validation_epoch_every_n_steps": 10,
+ "env": null,
+ "additional_envs": null,
+ "render": false,
+ "render_video": false,
+ "keep_all_videos": false,
+ "video_skip": 5,
+ "rollout": {
+ "enabled": true,
+ "n": 50,
+ "horizon": 1000,
+ "rate": 1,
+ "warmstart": 0,
+ "terminate_on_success": true
+ }
+ },
+ "train": {
+ "data": null,
+ "output_dir": "../td3_bc_trained_models",
+ "num_data_workers": 0,
+ "hdf5_cache_mode": "all",
+ "hdf5_use_swmr": true,
+ "hdf5_load_next_obs": true,
+ "hdf5_normalize_obs": true,
+ "hdf5_filter_key": null,
+ "hdf5_validation_filter_key": null,
+ "seq_length": 1,
+ "pad_seq_length": true,
+ "frame_stack": 1,
+ "pad_frame_stack": true,
+ "dataset_keys": [
+ "actions",
+ "rewards",
+ "dones"
+ ],
+ "goal_mode": null,
+ "cuda": true,
+ "batch_size": 256,
+ "num_epochs": 200,
+ "seed": 1
+ },
+ "algo": {
+ "optim_params": {
+ "critic": {
+ "learning_rate": {
+ "initial": 0.0003,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ },
+ "start_epoch": -1,
+ "end_epoch": -1
+ },
+ "actor": {
+ "learning_rate": {
+ "initial": 0.0003,
+ "decay_factor": 0.1,
+ "epoch_schedule": []
+ },
+ "regularization": {
+ "L2": 0.0
+ },
+ "start_epoch": -1,
+ "end_epoch": -1
+ }
+ },
+ "alpha": 2.5,
+ "discount": 0.99,
+ "n_step": 1,
+ "target_tau": 0.005,
+ "infinite_horizon": false,
+ "critic": {
+ "use_huber": false,
+ "max_gradient_norm": null,
+ "value_bounds": null,
+ "ensemble": {
+ "n": 2,
+ "weight": 1.0
+ },
+ "layer_dims": [
+ 256,
+ 256
+ ]
+ },
+ "actor": {
+ "update_freq": 2,
+ "noise_std": 0.2,
+ "noise_clip": 0.5,
+ "layer_dims": [
+ 256,
+ 256
+ ]
+ }
+ },
+ "observation": {
+ "modalities": {
+ "obs": {
+ "low_dim": [
+ "flat"
+ ],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ },
+ "goal": {
+ "low_dim": [],
+ "rgb": [],
+ "depth": [],
+ "scan": []
+ }
+ },
+ "encoder": {
+ "low_dim": {
+ "core_class": null,
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "rgb": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "depth": {
+ "core_class": "VisualCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ },
+ "scan": {
+ "core_class": "ScanCore",
+ "core_kwargs": {},
+ "obs_randomizer_class": null,
+ "obs_randomizer_kwargs": {}
+ }
+ }
+ },
+ "meta": {
+ "hp_base_config_file": null,
+ "hp_keys": [],
+ "hp_values": []
+ }
+}
\ No newline at end of file
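In the TD3+BC template above, `alpha` (2.5) is the trade-off coefficient from the TD3+BC paper: the actor maximizes Q while staying close to the dataset actions, with the Q term rescaled by `alpha / mean|Q|`. A sketch of that objective, not robomimic's exact code:

```python
# TD3+BC actor objective sketch with alpha from the template above.
import torch
import torch.nn.functional as F

def td3_bc_actor_loss(q_pi, pi_actions, dataset_actions, alpha=2.5):
    lam = alpha / q_pi.abs().mean().detach()   # keeps the Q term on the same scale as the BC term
    return -lam * q_pi.mean() + F.mse_loss(pi_actions, dataset_actions)
```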
diff --git a/phantom/submodules/phantom-robomimic/robomimic/macros.py b/phantom/submodules/phantom-robomimic/robomimic/macros.py
new file mode 100644
index 0000000000000000000000000000000000000000..7496e93bbe5277c68573bdea7543c4a187ec490c
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/macros.py
@@ -0,0 +1,56 @@
+"""
+Set of global variables shared across robomimic
+"""
+# Sets debugging mode. Should be set at top-level script so that internal
+# debugging functionalities are made active
+DEBUG = False
+
+# Whether to visualize the before & after of an observation randomizer
+VISUALIZE_RANDOMIZER = False
+
+# wandb entity (eg. username or team name)
+WANDB_ENTITY = None
+
+# wandb api key (obtain from https://wandb.ai/authorize)
+# alternatively, set up wandb from terminal with `wandb login`
+WANDB_API_KEY = None
+
+### Slack Notifications ###
+
+# Token for sending slack notifications
+SLACK_TOKEN = None
+
+# User ID for user that should receive slack notifications
+SLACK_USER_ID = None
+
+
+### Local Sync Settings ###
+
+# By specifying this path, you can sync the most important results of training back to this folder
+RESULTS_SYNC_PATH = None
+
+# This will be automatically populated.
+RESULTS_SYNC_PATH_ABS = None
+
+
+### MagLev and NGC Cluster Settings ###
+
+# Whether training is happening on MagLev / NGC (should set this on repos hosted in MagLev / NGC scratch space or in Docker)
+USE_MAGLEV = False
+USE_NGC = False
+
+# When using MagLev / NGC, sync the most important results of training back to this directory in scratch space.
+# This path should be relative to the base scratch space directory (for MagLev) or an absolute path (for NGC)
+MAGLEV_SCRATCH_SYNC_PATH = None
+NGC_SCRATCH_SYNC_PATH = None
+
+try:
+ from robomimic.macros_private import *
+except ImportError:
+ from robomimic.utils.log_utils import log_warning
+ import robomimic
+ log_warning(
+ "No private macro file found!"\
+ "\nIt is recommended to use a private macro file"\
+ "\nTo setup, run: python {}/scripts/setup_macros.py".format(robomimic.__path__[0])
+ )
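Because this macros.py lives inside a vendored submodule, it is usually better not to edit it in place. A hedged example of the two alternatives the file itself points to: generate `macros_private.py` with the referenced setup script, or override the globals at runtime before any training code reads them (the entity name below is a placeholder):

```python
# Option 1 (shell): python phantom/submodules/phantom-robomimic/robomimic/scripts/setup_macros.py
# Option 2: override at runtime, before the training code imports these values.
import robomimic.macros as Macros

Macros.WANDB_ENTITY = "my-team"   # placeholder entity
Macros.WANDB_API_KEY = None       # prefer `wandb login` over hard-coding a key
Macros.DEBUG = False
```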
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/__init__.py b/phantom/submodules/phantom-robomimic/robomimic/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7460f9309af64c4578b547e0944c7e1366b5946c
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/__init__.py
@@ -0,0 +1 @@
+from .obs_core import EncoderCore, Randomizer
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/base_nets.py b/phantom/submodules/phantom-robomimic/robomimic/models/base_nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..18302a2c97a5278777adf2e626e8236d654143b2
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/base_nets.py
@@ -0,0 +1,1117 @@
+"""
+Contains torch Modules that correspond to basic network building blocks, like
+MLP, RNN, and CNN backbones.
+"""
+
+import math
+import abc
+import numpy as np
+import textwrap
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms
+from torchvision import models as vision_models
+
+import robomimic.utils.tensor_utils as TensorUtils
+
+
+CONV_ACTIVATIONS = {
+ "relu": nn.ReLU,
+ "None": None,
+ None: None,
+}
+
+
+def rnn_args_from_config(rnn_config):
+ """
+ Takes a Config object corresponding to RNN settings
+ (for example `config.algo.rnn` in BCConfig) and extracts
+ rnn kwargs for instantiating rnn networks.
+ """
+ return dict(
+ rnn_hidden_dim=rnn_config.hidden_dim,
+ rnn_num_layers=rnn_config.num_layers,
+ rnn_type=rnn_config.rnn_type,
+ rnn_kwargs=dict(rnn_config.kwargs),
+ )
+
+
+def transformer_args_from_config(transformer_config):
+ """
+ Takes a Config object corresponding to Transformer settings
+ (for example `config.algo.transformer` in BCConfig) and extracts
+ transformer kwargs for instantiating transformer networks.
+ """
+ transformer_args = dict(
+ transformer_context_length=transformer_config.context_length,
+ transformer_embed_dim=transformer_config.embed_dim,
+ transformer_num_heads=transformer_config.num_heads,
+ transformer_emb_dropout=transformer_config.emb_dropout,
+ transformer_attn_dropout=transformer_config.attn_dropout,
+ transformer_block_output_dropout=transformer_config.block_output_dropout,
+ transformer_sinusoidal_embedding=transformer_config.sinusoidal_embedding,
+ transformer_activation=transformer_config.activation,
+ transformer_nn_parameter_for_timesteps=transformer_config.nn_parameter_for_timesteps,
+ )
+
+ if "num_layers" in transformer_config:
+ transformer_args["transformer_num_layers"] = transformer_config.num_layers
+
+ return transformer_args
+
+
+class Module(torch.nn.Module):
+ """
+ Base class for networks. The only difference from torch.nn.Module is that it
+ requires implementing @output_shape.
+ """
+ @abc.abstractmethod
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ raise NotImplementedError
+
+
+class Sequential(torch.nn.Sequential, Module):
+ """
+ Compose multiple Modules together (defined above).
+ """
+ def __init__(self, *args, has_output_shape = True):
+ """
+ Args:
+ has_output_shape (bool, optional): indicates whether output_shape can be called on the Sequential module.
+ torch.nn modules do not have an output_shape, but Modules (defined above) do. Defaults to True.
+ """
+ for arg in args:
+ if has_output_shape:
+ assert isinstance(arg, Module)
+ else:
+ assert isinstance(arg, nn.Module)
+ torch.nn.Sequential.__init__(self, *args)
+ self.fixed = False
+ self.has_output_shape = has_output_shape
+
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ if not self.has_output_shape:
+ raise NotImplementedError("Output shape is not defined for this module")
+ out_shape = input_shape
+ for module in self:
+ out_shape = module.output_shape(out_shape)
+ return out_shape
+
+ def freeze(self):
+ self.fixed = True
+
+ def train(self, mode):
+ if self.fixed:
+ super().train(False)
+ else:
+ super().train(mode)
+
+
+class Parameter(Module):
+ """
+ A class that is a thin wrapper around a torch.nn.Parameter to make for easy saving
+ and optimization.
+ """
+ def __init__(self, init_tensor):
+ """
+ Args:
+ init_tensor (torch.Tensor): initial tensor
+ """
+ super(Parameter, self).__init__()
+ self.param = torch.nn.Parameter(init_tensor)
+
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ return list(self.param.shape)
+
+ def forward(self, inputs=None):
+ """
+ Forward call just returns the parameter tensor.
+ """
+ return self.param
+
+
+class Unsqueeze(Module):
+ """
+ Trivial class that unsqueezes the input. Useful for including in a nn.Sequential network
+ """
+ def __init__(self, dim):
+ super(Unsqueeze, self).__init__()
+ self.dim = dim
+
+ def output_shape(self, input_shape=None):
+ assert input_shape is not None
+ return input_shape + [1] if self.dim == -1 else input_shape[:self.dim + 1] + [1] + input_shape[self.dim + 1:]
+
+ def forward(self, x):
+ return x.unsqueeze(dim=self.dim)
+
+
+class Squeeze(Module):
+ """
+ Trivial class that squeezes the input. Useful for including in a nn.Sequential network
+ """
+
+ def __init__(self, dim):
+ super(Squeeze, self).__init__()
+ self.dim = dim
+
+ def output_shape(self, input_shape=None):
+ assert input_shape is not None
+ return input_shape[:self.dim] + input_shape[self.dim+1:] if input_shape[self.dim] == 1 else input_shape
+
+ def forward(self, x):
+ return x.squeeze(dim=self.dim)
+
+
+class MLP(Module):
+ """
+ Base class for simple Multi-Layer Perceptrons.
+ """
+ def __init__(
+ self,
+ input_dim,
+ output_dim,
+ layer_dims=(),
+ layer_func=nn.Linear,
+ layer_func_kwargs=None,
+ activation=nn.ReLU,
+ dropouts=None,
+ normalization=False,
+ output_activation=None,
+ ):
+ """
+ Args:
+ input_dim (int): dimension of inputs
+
+ output_dim (int): dimension of outputs
+
+ layer_dims ([int]): sequence of integers for the hidden layers sizes
+
+ layer_func: mapping per layer - defaults to Linear
+
+ layer_func_kwargs (dict): kwargs for @layer_func
+
+ activation: non-linearity per layer - defaults to ReLU
+
+ dropouts ([float]): if not None, adds dropout layers with the corresponding probabilities
+ after every layer. Must be same size as @layer_dims.
+
+ normalization (bool): if True, apply layer normalization after each layer
+
+ output_activation: if provided, applies the provided non-linearity to the output layer
+ """
+ super(MLP, self).__init__()
+ layers = []
+ dim = input_dim
+ if layer_func_kwargs is None:
+ layer_func_kwargs = dict()
+ if dropouts is not None:
+ assert(len(dropouts) == len(layer_dims))
+ for i, l in enumerate(layer_dims):
+ layers.append(layer_func(dim, l, **layer_func_kwargs))
+ if normalization:
+ layers.append(nn.LayerNorm(l))
+ layers.append(activation())
+ if dropouts is not None and dropouts[i] > 0.:
+ layers.append(nn.Dropout(dropouts[i]))
+ dim = l
+ layers.append(layer_func(dim, output_dim))
+ if output_activation is not None:
+ layers.append(output_activation())
+ self._layer_func = layer_func
+ self.nets = layers
+ self._model = nn.Sequential(*layers)
+
+ self._layer_dims = layer_dims
+ self._input_dim = input_dim
+ self._output_dim = output_dim
+ self._dropouts = dropouts
+ self._act = activation
+ self._output_act = output_activation
+
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ return [self._output_dim]
+
+ def forward(self, inputs):
+ """
+ Forward pass.
+ """
+ return self._model(inputs)
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = str(self.__class__.__name__)
+ act = None if self._act is None else self._act.__name__
+ output_act = None if self._output_act is None else self._output_act.__name__
+
+ indent = ' ' * 4
+ msg = "input_dim={}\noutput_dim={}\nlayer_dims={}\nlayer_func={}\ndropout={}\nact={}\noutput_act={}".format(
+ self._input_dim, self._output_dim, self._layer_dims,
+ self._layer_func.__name__, self._dropouts, act, output_act
+ )
+ msg = textwrap.indent(msg, indent)
+ msg = header + '(\n' + msg + '\n)'
+ return msg
+
+
+class RNN_Base(Module):
+ """
+ A wrapper class for a multi-step RNN and a per-step network.
+ """
+ def __init__(
+ self,
+ input_dim,
+ rnn_hidden_dim,
+ rnn_num_layers,
+ rnn_type="LSTM", # [LSTM, GRU]
+ rnn_kwargs=None,
+ per_step_net=None,
+ ):
+ """
+ Args:
+ input_dim (int): dimension of inputs
+
+ rnn_hidden_dim (int): RNN hidden dimension
+
+ rnn_num_layers (int): number of RNN layers
+
+ rnn_type (str): [LSTM, GRU]
+
+ rnn_kwargs (dict): kwargs for the torch.nn.LSTM / GRU
+
+ per_step_net: a network that runs per time step on top of the RNN output
+ """
+ super(RNN_Base, self).__init__()
+ self.per_step_net = per_step_net
+ if per_step_net is not None:
+ assert isinstance(per_step_net, Module), "RNN_Base: per_step_net is not instance of Module"
+
+ assert rnn_type in ["LSTM", "GRU"]
+ rnn_cls = nn.LSTM if rnn_type == "LSTM" else nn.GRU
+ rnn_kwargs = rnn_kwargs if rnn_kwargs is not None else {}
+ rnn_is_bidirectional = rnn_kwargs.get("bidirectional", False)
+
+ self.nets = rnn_cls(
+ input_size=input_dim,
+ hidden_size=rnn_hidden_dim,
+ num_layers=rnn_num_layers,
+ batch_first=True,
+ **rnn_kwargs,
+ )
+
+ self._hidden_dim = rnn_hidden_dim
+ self._num_layers = rnn_num_layers
+ self._rnn_type = rnn_type
+ self._num_directions = int(rnn_is_bidirectional) + 1 # 2 if bidirectional, 1 otherwise
+
+ @property
+ def rnn_type(self):
+ return self._rnn_type
+
+ def get_rnn_init_state(self, batch_size, device):
+ """
+ Get a default RNN state (zeros)
+ Args:
+ batch_size (int): batch size dimension
+
+ device: device the hidden state should be sent to.
+
+ Returns:
+ hidden_state (torch.Tensor or tuple): returns hidden state tensor or tuple of hidden state tensors
+ depending on the RNN type
+ """
+ h_0 = torch.zeros(self._num_layers * self._num_directions, batch_size, self._hidden_dim).to(device)
+ if self._rnn_type == "LSTM":
+ c_0 = torch.zeros(self._num_layers * self._num_directions, batch_size, self._hidden_dim).to(device)
+ return h_0, c_0
+ else:
+ return h_0
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+
+ # infer time dimension from input shape and add to per_step_net output shape
+ if self.per_step_net is not None:
+ out = self.per_step_net.output_shape(input_shape[1:])
+ if isinstance(out, dict):
+ out = {k: [input_shape[0]] + out[k] for k in out}
+ else:
+ out = [input_shape[0]] + out
+ else:
+ out = [input_shape[0], self._num_layers * self._hidden_dim]
+ return out
+
+ def forward(self, inputs, rnn_init_state=None, return_state=False):
+ """
+ Forward a sequence of inputs through the RNN and the per-step network.
+
+ Args:
+ inputs (torch.Tensor): tensor input of shape [B, T, D], where D is the RNN input size
+
+ rnn_init_state: rnn hidden state, initialize to zero state if set to None
+
+ return_state (bool): whether to return hidden state
+
+ Returns:
+ outputs: outputs of the per_step_net
+
+ rnn_state: return rnn state at the end if return_state is set to True
+ """
+ assert inputs.ndimension() == 3 # [B, T, D]
+ batch_size, seq_length, inp_dim = inputs.shape
+ if rnn_init_state is None:
+ rnn_init_state = self.get_rnn_init_state(batch_size, device=inputs.device)
+
+ outputs, rnn_state = self.nets(inputs, rnn_init_state)
+ if self.per_step_net is not None:
+ outputs = TensorUtils.time_distributed(outputs, self.per_step_net)
+
+ if return_state:
+ return outputs, rnn_state
+ else:
+ return outputs
+
+ def forward_step(self, inputs, rnn_state):
+ """
+ Forward a single step input through the RNN and per-step network, and return the new hidden state.
+ Args:
+ inputs (torch.Tensor): tensor input of shape [B, D], where D is the RNN input size
+
+ rnn_state: rnn hidden state, initialize to zero state if set to None
+
+ Returns:
+ outputs: outputs of the per_step_net
+
+ rnn_state: return the new rnn state
+ """
+ assert inputs.ndimension() == 2
+ inputs = TensorUtils.to_sequence(inputs)
+ outputs, rnn_state = self.forward(
+ inputs,
+ rnn_init_state=rnn_state,
+ return_state=True,
+ )
+ return outputs[:, 0], rnn_state
+
+
+"""
+================================================
+Visual Backbone Networks
+================================================
+"""
+class ConvBase(Module):
+ """
+ Base class for ConvNets.
+ """
+ def __init__(self):
+ super(ConvBase, self).__init__()
+
+ # dirty hack - re-implement to pass the buck onto subclasses from ABC parent
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ raise NotImplementedError
+
+ def forward(self, inputs):
+ x = self.nets(inputs)
+ if list(self.output_shape(list(inputs.shape)[1:])) != list(x.shape)[1:]:
+ raise ValueError('Size mismatch: expect size %s, but got size %s' % (
+ str(self.output_shape(list(inputs.shape)[1:])), str(list(x.shape)[1:]))
+ )
+ return x
+
+
+class ResNet18Conv(ConvBase):
+ """
+ A ResNet18 block that can be used to process input images.
+ """
+ def __init__(
+ self,
+ input_channel=3,
+ pretrained=False,
+ input_coord_conv=False,
+ ):
+ """
+ Args:
+ input_channel (int): number of input channels for input images to the network.
+ If not equal to 3, modifies first conv layer in ResNet to handle the number
+ of input channels.
+ pretrained (bool): if True, load pretrained weights for all ResNet layers.
+ input_coord_conv (bool): if True, use a coordinate convolution for the first layer
+ (a convolution where input channels are modified to encode spatial pixel location)
+ """
+ super(ResNet18Conv, self).__init__()
+ net = vision_models.resnet18(pretrained=pretrained)
+
+ if input_coord_conv:
+ net.conv1 = CoordConv2d(input_channel, 64, kernel_size=7, stride=2, padding=3, bias=False)
+ elif input_channel != 3:
+ net.conv1 = nn.Conv2d(input_channel, 64, kernel_size=7, stride=2, padding=3, bias=False)
+
+ # cut the last fc layer
+ self._input_coord_conv = input_coord_conv
+ self._input_channel = input_channel
+ self.nets = torch.nn.Sequential(*(list(net.children())[:-2]))
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ assert(len(input_shape) == 3)
+ out_h = int(math.ceil(input_shape[1] / 32.))
+ out_w = int(math.ceil(input_shape[2] / 32.))
+ return [512, out_h, out_w]
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ return header + '(input_channel={}, input_coord_conv={})'.format(self._input_channel, self._input_coord_conv)
+
+
+class R3MConv(ConvBase):
+ """
+ Base class for ConvNets pretrained with R3M (https://arxiv.org/abs/2203.12601)
+ """
+ def __init__(
+ self,
+ input_channel=3,
+ r3m_model_class='resnet18',
+ freeze=True,
+ ):
+ """
+ Using R3M pretrained observation encoder network proposed by https://arxiv.org/abs/2203.12601
+ Args:
+ input_channel (int): number of input channels for input images to the network.
+ If not equal to 3, modifies first conv layer in ResNet to handle the number
+ of input channels.
+            r3m_model_class (str): select one of the R3M pretrained models: "resnet18", "resnet34", or "resnet50"
+ freeze (bool): if True, use a frozen R3M pretrained model.
+ """
+ super(R3MConv, self).__init__()
+
+ try:
+ from r3m import load_r3m
+ except ImportError:
+ print("WARNING: could not load r3m library! Please follow https://github.com/facebookresearch/r3m to install R3M")
+
+ net = load_r3m(r3m_model_class)
+
+        assert input_channel == 3  # R3M only supports input images with 3 channels
+        assert r3m_model_class in ["resnet18", "resnet34", "resnet50"]  # make sure the selected R3M model exists
+
+ # cut the last fc layer
+ self._input_channel = input_channel
+ self._r3m_model_class = r3m_model_class
+ self._freeze = freeze
+ self._input_coord_conv = False
+ self._pretrained = True
+
+ preprocess = nn.Sequential(
+ transforms.Resize(256),
+ transforms.CenterCrop(224),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+ )
+ self.nets = Sequential(*([preprocess] + list(net.module.convnet.children())), has_output_shape = False)
+ if freeze:
+ self.nets.freeze()
+
+ self.weight_sum = np.sum([param.cpu().data.numpy().sum() for param in self.nets.parameters()])
+ if freeze:
+ for param in self.nets.parameters():
+ param.requires_grad = False
+
+ self.nets.eval()
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ assert(len(input_shape) == 3)
+
+ if self._r3m_model_class == 'resnet50':
+ out_dim = 2048
+ else:
+ out_dim = 512
+
+ return [out_dim, 1, 1]
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ return header + '(input_channel={}, input_coord_conv={}, pretrained={}, freeze={})'.format(self._input_channel, self._input_coord_conv, self._pretrained, self._freeze)
+
+
+class MVPConv(ConvBase):
+ """
+ Base class for ConvNets pretrained with MVP (https://arxiv.org/abs/2203.06173)
+ """
+ def __init__(
+ self,
+ input_channel=3,
+ mvp_model_class='vitb-mae-egosoup',
+ freeze=True,
+ ):
+ """
+ Using MVP pretrained observation encoder network proposed by https://arxiv.org/abs/2203.06173
+ Args:
+ input_channel (int): number of input channels for input images to the network.
+ If not equal to 3, modifies first conv layer in ResNet to handle the number
+ of input channels.
+            mvp_model_class (str): select one of the MVP pretrained models: "vits-mae-hoi", "vits-mae-in", "vits-sup-in", "vitb-mae-egosoup", or "vitl-256-mae-egosoup"
+ freeze (bool): if True, use a frozen MVP pretrained model.
+ """
+ super(MVPConv, self).__init__()
+
+ try:
+ import mvp
+ except ImportError:
+ print("WARNING: could not load mvp library! Please follow https://github.com/ir413/mvp to install MVP.")
+
+ self.nets = mvp.load(mvp_model_class)
+ if freeze:
+ self.nets.freeze()
+
+        assert input_channel == 3  # MVP only supports input images with 3 channels
+        assert mvp_model_class in ["vits-mae-hoi", "vits-mae-in", "vits-sup-in", "vitb-mae-egosoup", "vitl-256-mae-egosoup"]  # make sure the selected MVP model exists
+
+ self._input_channel = input_channel
+ self._freeze = freeze
+ self._mvp_model_class = mvp_model_class
+ self._input_coord_conv = False
+ self._pretrained = True
+
+ if '256' in mvp_model_class:
+ input_img_size = 256
+ else:
+ input_img_size = 224
+ self.preprocess = nn.Sequential(
+ transforms.Resize(input_img_size)
+ )
+
+ def forward(self, inputs):
+ x = self.preprocess(inputs)
+ x = self.nets(x)
+ if list(self.output_shape(list(inputs.shape)[1:])) != list(x.shape)[1:]:
+ raise ValueError('Size mismatch: expect size %s, but got size %s' % (
+ str(self.output_shape(list(inputs.shape)[1:])), str(list(x.shape)[1:]))
+ )
+ return x
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ assert(len(input_shape) == 3)
+ if 'vitb' in self._mvp_model_class:
+ output_shape = [768]
+ elif 'vitl' in self._mvp_model_class:
+ output_shape = [1024]
+ else:
+ output_shape = [384]
+ return output_shape
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ return header + '(input_channel={}, input_coord_conv={}, pretrained={}, freeze={})'.format(self._input_channel, self._input_coord_conv, self._pretrained, self._freeze)
+
+
+class CoordConv2d(nn.Conv2d, Module):
+ """
+ 2D Coordinate Convolution
+
+ Source: An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution
+ https://arxiv.org/abs/1807.03247
+ (e.g. adds 2 channels per input feature map corresponding to (x, y) location on map)
+ """
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ bias=True,
+ padding_mode='zeros',
+ coord_encoding='position',
+ ):
+ """
+ Args:
+ in_channels: number of channels of the input tensor [C, H, W]
+ out_channels: number of output channels of the layer
+ kernel_size: convolution kernel size
+ stride: conv stride
+ padding: conv padding
+ dilation: conv dilation
+ groups: conv groups
+ bias: conv bias
+ padding_mode: conv padding mode
+ coord_encoding: type of coordinate encoding. currently only 'position' is implemented
+ """
+
+ assert(coord_encoding in ['position'])
+ self.coord_encoding = coord_encoding
+ if coord_encoding == 'position':
+            in_channels += 2  # two extra channels for positional encoding
+ self._position_enc = None # position encoding
+ else:
+ raise Exception("CoordConv2d: coord encoding {} not implemented".format(self.coord_encoding))
+ nn.Conv2d.__init__(
+ self,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias,
+ padding_mode=padding_mode
+ )
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+
+ # adds 2 to channel dimension
+ return [input_shape[0] + 2] + input_shape[1:]
+
+ def forward(self, input):
+ b, c, h, w = input.shape
+ if self.coord_encoding == 'position':
+ if self._position_enc is None:
+ pos_y, pos_x = torch.meshgrid(torch.arange(h), torch.arange(w))
+ pos_y = pos_y.float().to(input.device) / float(h)
+ pos_x = pos_x.float().to(input.device) / float(w)
+ self._position_enc = torch.stack((pos_y, pos_x)).unsqueeze(0)
+ pos_enc = self._position_enc.expand(b, -1, -1, -1)
+ input = torch.cat((input, pos_enc), dim=1)
+ return super(CoordConv2d, self).forward(input)
+
+
+class ShallowConv(ConvBase):
+ """
+ A shallow convolutional encoder from https://rll.berkeley.edu/dsae/dsae.pdf
+ """
+ def __init__(self, input_channel=3, output_channel=32):
+ super(ShallowConv, self).__init__()
+ self._input_channel = input_channel
+ self._output_channel = output_channel
+ self.nets = nn.Sequential(
+ torch.nn.Conv2d(input_channel, 64, kernel_size=7, stride=2, padding=3),
+ torch.nn.ReLU(),
+ torch.nn.Conv2d(64, 32, kernel_size=1, stride=1, padding=0),
+ torch.nn.ReLU(),
+ torch.nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
+ torch.nn.ReLU(),
+ torch.nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
+ )
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ assert(len(input_shape) == 3)
+ assert(input_shape[0] == self._input_channel)
+ out_h = int(math.floor(input_shape[1] / 2.))
+ out_w = int(math.floor(input_shape[2] / 2.))
+ return [self._output_channel, out_h, out_w]
+
+
+class Conv1dBase(Module):
+ """
+ Base class for stacked Conv1d layers.
+
+ Args:
+ input_channel (int): Number of channels for inputs to this network
+ activation (None or str): Per-layer activation to use. Defaults to "relu". Valid options are
+            currently "relu", or None for no activation
+ out_channels (list of int): Output channel size for each sequential Conv1d layer
+ kernel_size (list of int): Kernel sizes for each sequential Conv1d layer
+ stride (list of int): Stride sizes for each sequential Conv1d layer
+        conv_kwargs (dict): additional nn.Conv1d args to use, in list form, where the ith element corresponds to the
+            argument to be passed to the ith Conv1d layer.
+ See https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html for specific possible arguments.
+ """
+ def __init__(
+ self,
+ input_channel=1,
+ activation="relu",
+ out_channels=(32, 64, 64),
+ kernel_size=(8, 4, 2),
+ stride=(4, 2, 1),
+ **conv_kwargs,
+ ):
+ super(Conv1dBase, self).__init__()
+
+ # Get activation requested
+ activation = CONV_ACTIVATIONS[activation]
+
+ # Add layer kwargs
+ conv_kwargs["out_channels"] = out_channels
+ conv_kwargs["kernel_size"] = kernel_size
+ conv_kwargs["stride"] = stride
+
+ # Generate network
+ self.n_layers = len(out_channels)
+ layers = OrderedDict()
+ for i in range(self.n_layers):
+ layer_kwargs = {k: v[i] for k, v in conv_kwargs.items()}
+ layers[f'conv{i}'] = nn.Conv1d(
+ in_channels=input_channel,
+ **layer_kwargs,
+ )
+ if activation is not None:
+ layers[f'act{i}'] = activation()
+ input_channel = layer_kwargs["out_channels"]
+
+ # Store network
+ self.nets = nn.Sequential(layers)
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ channels, length = input_shape
+ for i in range(self.n_layers):
+ net = getattr(self.nets, f"conv{i}")
+ channels = net.out_channels
+ length = int((length + 2 * net.padding[0] - net.dilation[0] * (net.kernel_size[0] - 1) - 1) / net.stride[0]) + 1
+ return [channels, length]
+
+ def forward(self, inputs):
+ x = self.nets(inputs)
+ if list(self.output_shape(list(inputs.shape)[1:])) != list(x.shape)[1:]:
+ raise ValueError('Size mismatch: expect size %s, but got size %s' % (
+ str(self.output_shape(list(inputs.shape)[1:])), str(list(x.shape)[1:]))
+ )
+ return x
+
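+# Usage sketch (illustrative; the input length is an assumption, not taken from the
+# original source). With the defaults out_channels=(32, 64, 64), kernel_size=(8, 4, 2)
+# and stride=(4, 2, 1), a length-50 single-channel signal shrinks as 50 -> 11 -> 4 -> 3:
+#
+#   net = Conv1dBase(input_channel=1)
+#   net.output_shape([1, 50])           # -> [64, 3]
+#   net(torch.zeros(2, 1, 50)).shape    # -> torch.Size([2, 64, 3])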
+
+"""
+================================================
+Pooling Networks
+================================================
+"""
+class SpatialSoftmax(ConvBase):
+ """
+ Spatial Softmax Layer.
+
+ Based on Deep Spatial Autoencoders for Visuomotor Learning by Finn et al.
+ https://rll.berkeley.edu/dsae/dsae.pdf
+ """
+ def __init__(
+ self,
+ input_shape,
+ num_kp=32,
+ temperature=1.,
+ learnable_temperature=False,
+ output_variance=False,
+ noise_std=0.0,
+ ):
+ """
+ Args:
+ input_shape (list): shape of the input feature (C, H, W)
+ num_kp (int): number of keypoints (None for not using spatialsoftmax)
+ temperature (float): temperature term for the softmax.
+ learnable_temperature (bool): whether to learn the temperature
+ output_variance (bool): treat attention as a distribution, and compute second-order statistics to return
+ noise_std (float): add random spatial noise to the predicted keypoints
+ """
+ super(SpatialSoftmax, self).__init__()
+ assert len(input_shape) == 3
+ self._in_c, self._in_h, self._in_w = input_shape # (C, H, W)
+
+ if num_kp is not None:
+ self.nets = torch.nn.Conv2d(self._in_c, num_kp, kernel_size=1)
+ self._num_kp = num_kp
+ else:
+ self.nets = None
+ self._num_kp = self._in_c
+ self.learnable_temperature = learnable_temperature
+ self.output_variance = output_variance
+ self.noise_std = noise_std
+
+ if self.learnable_temperature:
+ # temperature will be learned
+ temperature = torch.nn.Parameter(torch.ones(1) * temperature, requires_grad=True)
+ self.register_parameter('temperature', temperature)
+ else:
+ # temperature held constant after initialization
+ temperature = torch.nn.Parameter(torch.ones(1) * temperature, requires_grad=False)
+ self.register_buffer('temperature', temperature)
+
+ pos_x, pos_y = np.meshgrid(
+ np.linspace(-1., 1., self._in_w),
+ np.linspace(-1., 1., self._in_h)
+ )
+ pos_x = torch.from_numpy(pos_x.reshape(1, self._in_h * self._in_w)).float()
+ pos_y = torch.from_numpy(pos_y.reshape(1, self._in_h * self._in_w)).float()
+ self.register_buffer('pos_x', pos_x)
+ self.register_buffer('pos_y', pos_y)
+
+ self.kps = None
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = format(str(self.__class__.__name__))
+ return header + '(num_kp={}, temperature={}, noise={})'.format(
+ self._num_kp, self.temperature.item(), self.noise_std)
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ assert(len(input_shape) == 3)
+ assert(input_shape[0] == self._in_c)
+ return [self._num_kp, 2]
+
+ def forward(self, feature):
+ """
+ Forward pass through spatial softmax layer. For each keypoint, a 2D spatial
+ probability distribution is created using a softmax, where the support is the
+ pixel locations. This distribution is used to compute the expected value of
+ the pixel location, which becomes a keypoint of dimension 2. K such keypoints
+ are created.
+
+ Returns:
+ out (torch.Tensor or tuple): mean keypoints of shape [B, K, 2], and possibly
+ keypoint variance of shape [B, K, 2, 2] corresponding to the covariance
+ under the 2D spatial softmax distribution
+ """
+ assert(feature.shape[1] == self._in_c)
+ assert(feature.shape[2] == self._in_h)
+ assert(feature.shape[3] == self._in_w)
+ if self.nets is not None:
+ feature = self.nets(feature)
+
+ # [B, K, H, W] -> [B * K, H * W] where K is number of keypoints
+ feature = feature.reshape(-1, self._in_h * self._in_w)
+ # 2d softmax normalization
+ attention = F.softmax(feature / self.temperature, dim=-1)
+ # [1, H * W] x [B * K, H * W] -> [B * K, 1] for spatial coordinate mean in x and y dimensions
+ expected_x = torch.sum(self.pos_x * attention, dim=1, keepdim=True)
+ expected_y = torch.sum(self.pos_y * attention, dim=1, keepdim=True)
+ # stack to [B * K, 2]
+ expected_xy = torch.cat([expected_x, expected_y], 1)
+ # reshape to [B, K, 2]
+ feature_keypoints = expected_xy.view(-1, self._num_kp, 2)
+
+ if self.training:
+ noise = torch.randn_like(feature_keypoints) * self.noise_std
+ feature_keypoints += noise
+
+ if self.output_variance:
+ # treat attention as a distribution, and compute second-order statistics to return
+ expected_xx = torch.sum(self.pos_x * self.pos_x * attention, dim=1, keepdim=True)
+ expected_yy = torch.sum(self.pos_y * self.pos_y * attention, dim=1, keepdim=True)
+ expected_xy = torch.sum(self.pos_x * self.pos_y * attention, dim=1, keepdim=True)
+ var_x = expected_xx - expected_x * expected_x
+ var_y = expected_yy - expected_y * expected_y
+ var_xy = expected_xy - expected_x * expected_y
+ # stack to [B * K, 4] and then reshape to [B, K, 2, 2] where last 2 dims are covariance matrix
+ feature_covar = torch.cat([var_x, var_xy, var_xy, var_y], 1).reshape(-1, self._num_kp, 2, 2)
+ feature_keypoints = (feature_keypoints, feature_covar)
+
+ if isinstance(feature_keypoints, tuple):
+ self.kps = (feature_keypoints[0].detach(), feature_keypoints[1].detach())
+ else:
+ self.kps = feature_keypoints.detach()
+ return feature_keypoints
+
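+# Usage sketch (illustrative; shapes are assumptions, not taken from the original
+# source). SpatialSoftmax turns a (C, H, W) feature map into K keypoints, each the
+# softmax-weighted expected (x, y) pixel location in [-1, 1]:
+#
+#   pool = SpatialSoftmax(input_shape=[64, 7, 7], num_kp=32)
+#   pool(torch.zeros(4, 64, 7, 7)).shape   # -> torch.Size([4, 32, 2])
+#   pool.output_shape([64, 7, 7])          # -> [32, 2]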
+
+class SpatialMeanPool(Module):
+ """
+ Module that averages inputs across all spatial dimensions (dimension 2 and after),
+ leaving only the batch and channel dimensions.
+ """
+ def __init__(self, input_shape):
+ super(SpatialMeanPool, self).__init__()
+ assert len(input_shape) == 3 # [C, H, W]
+ self.in_shape = input_shape
+
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ return list(self.in_shape[:1]) # [C, H, W] -> [C]
+
+ def forward(self, inputs):
+ """Forward pass - average across all dimensions except batch and channel."""
+ return TensorUtils.flatten(inputs, begin_axis=2).mean(dim=2)
+
+
+class FeatureAggregator(Module):
+ """
+ Helpful class for aggregating features across a dimension. This is useful in
+ practice when training models that break an input image up into several patches
+    since features can be extracted per-patch using the same encoder and then
+ aggregated using this module.
+ """
+ def __init__(self, dim=1, agg_type="avg"):
+ super(FeatureAggregator, self).__init__()
+ self.dim = dim
+ self.agg_type = agg_type
+
+ def set_weight(self, w):
+ assert self.agg_type == "w_avg"
+ self.agg_weight = w
+
+ def clear_weight(self):
+ assert self.agg_type == "w_avg"
+ self.agg_weight = None
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ # aggregates on @self.dim, so it is removed from the output shape
+ return list(input_shape[:self.dim]) + list(input_shape[self.dim+1:])
+
+ def forward(self, x):
+ """Forward pooling pass."""
+ if self.agg_type == "avg":
+ # mean-pooling
+ return torch.mean(x, dim=1)
+ if self.agg_type == "w_avg":
+ # weighted mean-pooling
+ return torch.sum(x * self.agg_weight, dim=1)
+        raise Exception("unexpected agg type: {}".format(self.agg_type))
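+# Usage sketch (illustrative; shapes are assumptions, not taken from the original
+# source). FeatureAggregator pools per-patch features of shape (B, N, D) down to (B, D):
+#
+#   agg = FeatureAggregator(dim=1, agg_type="avg")
+#   patch_feats = torch.zeros(8, 4, 64)   # 4 patches with 64-dim features each
+#   agg(patch_feats).shape                # -> torch.Size([8, 64])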
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/distributions.py b/phantom/submodules/phantom-robomimic/robomimic/models/distributions.py
new file mode 100644
index 0000000000000000000000000000000000000000..411efb1a8bbc6b0da7ac6f628357dc9c178b8780
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/distributions.py
@@ -0,0 +1,123 @@
+"""
+Contains distribution models used as parts of other networks. These
+classes usually inherit or emulate torch distributions.
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributions as D
+
+
+class TanhWrappedDistribution(D.Distribution):
+ """
+ Class that wraps another valid torch distribution, such that sampled values from the base distribution are
+ passed through a tanh layer. The corresponding (log) probabilities are also modified accordingly.
+ Tanh Normal distribution - adapted from rlkit and CQL codebase
+ (https://github.com/aviralkumar2907/CQL/blob/d67dbe9cf5d2b96e3b462b6146f249b3d6569796/d4rl/rlkit/torch/distributions.py#L6).
+ """
+ def __init__(self, base_dist, scale=1.0, epsilon=1e-6):
+ """
+ Args:
+ base_dist (Distribution): Distribution to wrap with tanh output
+ scale (float): Scale of output
+ epsilon (float): Numerical stability epsilon when computing log-prob.
+ """
+ self.base_dist = base_dist
+ self.scale = scale
+ self.tanh_epsilon = epsilon
+ super(TanhWrappedDistribution, self).__init__()
+
+ def log_prob(self, value, pre_tanh_value=None):
+ """
+ Args:
+ value (torch.Tensor): some tensor to compute log probabilities for
+            pre_tanh_value: If specified, avoids recomputing atanh from @value, which is more numerically stable
+ """
+ value = value / self.scale
+ if pre_tanh_value is None:
+ one_plus_x = (1. + value).clamp(min=self.tanh_epsilon)
+ one_minus_x = (1. - value).clamp(min=self.tanh_epsilon)
+ pre_tanh_value = 0.5 * torch.log(one_plus_x / one_minus_x)
+ lp = self.base_dist.log_prob(pre_tanh_value)
+ tanh_lp = torch.log(1 - value * value + self.tanh_epsilon)
+ # In case the base dist already sums up the log probs, make sure we do the same
+ return lp - tanh_lp if len(lp.shape) == len(tanh_lp.shape) else lp - tanh_lp.sum(-1)
+
+ def sample(self, sample_shape=torch.Size(), return_pretanh_value=False):
+ """
+ Gradients will and should *not* pass through this operation.
+ See https://github.com/pytorch/pytorch/issues/4620 for discussion.
+ """
+ z = self.base_dist.sample(sample_shape=sample_shape).detach()
+
+ if return_pretanh_value:
+ return torch.tanh(z) * self.scale, z
+ else:
+ return torch.tanh(z) * self.scale
+
+ def rsample(self, sample_shape=torch.Size(), return_pretanh_value=False):
+ """
+ Sampling in the reparameterization case - for differentiable samples.
+ """
+ z = self.base_dist.rsample(sample_shape=sample_shape)
+
+ if return_pretanh_value:
+ return torch.tanh(z) * self.scale, z
+ else:
+ return torch.tanh(z) * self.scale
+
+ @property
+ def mean(self):
+ return self.base_dist.mean
+
+ @property
+ def stddev(self):
+ return self.base_dist.stddev
+
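+# Usage sketch (illustrative; the base distribution below is an assumption, not taken
+# from the original source). Wrapping a diagonal Normal squashes samples into
+# (-scale, scale); log_prob applies the change-of-variables correction
+# log p(y) = log p(z) - log(1 - y^2), where y = tanh(z):
+#
+#   base = D.Normal(loc=torch.zeros(3), scale=torch.ones(3))
+#   dist = TanhWrappedDistribution(base_dist=base, scale=1.0)
+#   a = dist.rsample()       # differentiable sample in (-1, 1), shape [3]
+#   lp = dist.log_prob(a)    # per-dimension corrected log-probabilities, shape [3]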
+
+class DiscreteValueDistribution(object):
+ """
+ Extension to torch categorical probability distribution in order to keep track
+ of the support (categorical values, or in this case, value atoms). This is
+ used for distributional value networks.
+ """
+ def __init__(self, values, probs=None, logits=None):
+ """
+ Creates a categorical distribution parameterized by either @probs or
+ @logits (but not both). Expects inputs to be consistent in shape
+ for broadcasting operations (e.g. multiplication).
+ """
+ self._values = values
+ self._categorical_dist = D.Categorical(probs=probs, logits=logits)
+
+ @property
+ def values(self):
+ return self._values
+
+ @property
+ def probs(self):
+ return self._categorical_dist.probs
+
+ @property
+ def logits(self):
+ return self._categorical_dist.logits
+
+ def mean(self):
+ """
+ Categorical distribution mean, taking the value support into account.
+ """
+ return (self._categorical_dist.probs * self._values).sum(dim=-1)
+
+ def variance(self):
+ """
+ Categorical distribution variance, taking the value support into account.
+ """
+ dist_squared = (self.mean().unsqueeze(-1) - self.values).pow(2)
+ return (self._categorical_dist.probs * dist_squared).sum(dim=-1)
+
+ def sample(self, sample_shape=torch.Size()):
+ """
+ Sample from the distribution. Make sure to return value atoms, not categorical class indices.
+ """
+ inds = self._categorical_dist.sample(sample_shape=sample_shape)
+        # index into the value atoms along the last (atom) dimension
+        return torch.gather(self.values, -1, inds.unsqueeze(-1)).squeeze(-1)
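+# Usage sketch (illustrative; the atoms and shapes are assumptions, not taken from the
+# original source). The distribution is a categorical over fixed value atoms, so
+# mean() returns sum_i p_i * z_i and sample() returns atoms rather than class indices:
+#
+#   atoms = torch.linspace(-10., 10., 51).unsqueeze(0)   # [1, 51] value support
+#   vd = DiscreteValueDistribution(values=atoms, logits=torch.zeros(1, 51))
+#   vd.mean()       # ~ tensor([0.]) for a uniform distribution over symmetric atoms
+#   vd.sample()     # one value atom per batch element, shape [1]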
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/obs_core.py b/phantom/submodules/phantom-robomimic/robomimic/models/obs_core.py
new file mode 100644
index 0000000000000000000000000000000000000000..4183043837c0eda0901f38a93c348e4085128b96
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/obs_core.py
@@ -0,0 +1,829 @@
+"""
+Contains torch Modules for core observation processing blocks
+such as encoders (e.g. EncoderCore, VisualCore, ScanCore, ...)
+and randomizers (e.g. Randomizer, CropRandomizer).
+"""
+
+import abc
+import numpy as np
+import textwrap
+import random
+
+import torch
+import torch.nn as nn
+from torchvision.transforms import Lambda, Compose
+import torchvision.transforms.functional as TVF
+
+import robomimic.models.base_nets as BaseNets
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.obs_utils as ObsUtils
+from robomimic.utils.python_utils import extract_class_init_kwargs_from_dict
+
+# NOTE: this is required for the backbone classes to be found by the `eval` call in the core networks
+from robomimic.models.base_nets import *
+from robomimic.utils.vis_utils import visualize_image_randomizer
+from robomimic.macros import VISUALIZE_RANDOMIZER
+
+
+"""
+================================================
+Encoder Core Networks (Abstract class)
+================================================
+"""
+class EncoderCore(BaseNets.Module):
+ """
+ Abstract class used to categorize all cores used to encode observations
+ """
+ def __init__(self, input_shape):
+ self.input_shape = input_shape
+ super(EncoderCore, self).__init__()
+
+ def __init_subclass__(cls, **kwargs):
+ """
+ Hook method to automatically register all valid subclasses so we can keep track of valid observation encoders
+ in a global dict.
+
+ This global dict stores mapping from observation encoder network name to class.
+ We keep track of these registries to enable automated class inference at runtime, allowing
+ users to simply extend our base encoder class and refer to that class in string form
+ in their config, without having to manually register their class internally.
+ This also future-proofs us for any additional encoder classes we would
+ like to add ourselves.
+ """
+ ObsUtils.register_encoder_core(cls)
+
+
+"""
+================================================
+Visual Core Networks (Backbone + Pool)
+================================================
+"""
+class VisualCore(EncoderCore, BaseNets.ConvBase):
+ """
+ A network block that combines a visual backbone network with optional pooling
+ and linear layers.
+ """
+ def __init__(
+ self,
+ input_shape,
+ backbone_class="ResNet18Conv",
+ pool_class="SpatialSoftmax",
+ backbone_kwargs=None,
+ pool_kwargs=None,
+ flatten=True,
+ feature_dimension=64,
+ ):
+ """
+ Args:
+ input_shape (tuple): shape of input (not including batch dimension)
+ backbone_class (str): class name for the visual backbone network. Defaults
+ to "ResNet18Conv".
+ pool_class (str): class name for the visual feature pooler (optional)
+ Common options are "SpatialSoftmax" and "SpatialMeanPool". Defaults to
+ "SpatialSoftmax".
+ backbone_kwargs (dict): kwargs for the visual backbone network (optional)
+ pool_kwargs (dict): kwargs for the visual feature pooler (optional)
+ flatten (bool): whether to flatten the visual features
+ feature_dimension (int): if not None, add a Linear layer to
+ project output into a desired feature dimension
+ """
+ super(VisualCore, self).__init__(input_shape=input_shape)
+ self.flatten = flatten
+
+ if backbone_kwargs is None:
+ backbone_kwargs = dict()
+
+ # add input channel dimension to visual core inputs
+ backbone_kwargs["input_channel"] = input_shape[0]
+
+ # extract only relevant kwargs for this specific backbone
+ backbone_kwargs = extract_class_init_kwargs_from_dict(cls=eval(backbone_class), dic=backbone_kwargs, copy=True)
+
+ # visual backbone
+ assert isinstance(backbone_class, str)
+ self.backbone = eval(backbone_class)(**backbone_kwargs)
+
+ assert isinstance(self.backbone, BaseNets.ConvBase)
+
+ feat_shape = self.backbone.output_shape(input_shape)
+ net_list = [self.backbone]
+
+ # maybe make pool net
+ if pool_class is not None:
+ assert isinstance(pool_class, str)
+ # feed output shape of backbone to pool net
+ if pool_kwargs is None:
+ pool_kwargs = dict()
+            # extract only relevant kwargs for this specific pooling class
+ pool_kwargs["input_shape"] = feat_shape
+ pool_kwargs = extract_class_init_kwargs_from_dict(cls=eval(pool_class), dic=pool_kwargs, copy=True)
+ self.pool = eval(pool_class)(**pool_kwargs)
+ assert isinstance(self.pool, BaseNets.Module)
+
+ feat_shape = self.pool.output_shape(feat_shape)
+ net_list.append(self.pool)
+ else:
+ self.pool = None
+
+ # flatten layer
+ if self.flatten:
+ net_list.append(torch.nn.Flatten(start_dim=1, end_dim=-1))
+
+ # maybe linear layer
+ self.feature_dimension = feature_dimension
+ if feature_dimension is not None:
+ assert self.flatten
+ linear = torch.nn.Linear(int(np.prod(feat_shape)), feature_dimension)
+ net_list.append(linear)
+
+ self.nets = nn.Sequential(*net_list)
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ if self.feature_dimension is not None:
+ # linear output
+ return [self.feature_dimension]
+ feat_shape = self.backbone.output_shape(input_shape)
+ if self.pool is not None:
+ # pool output
+ feat_shape = self.pool.output_shape(feat_shape)
+ # backbone + flat output
+ if self.flatten:
+ return [np.prod(feat_shape)]
+ else:
+ return feat_shape
+
+ def forward(self, inputs):
+ """
+ Forward pass through visual core.
+ """
+ ndim = len(self.input_shape)
+ assert tuple(inputs.shape)[-ndim:] == tuple(self.input_shape)
+ return super(VisualCore, self).forward(inputs)
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ indent = ' ' * 2
+ msg += textwrap.indent(
+ "\ninput_shape={}\noutput_shape={}".format(self.input_shape, self.output_shape(self.input_shape)), indent)
+ msg += textwrap.indent("\nbackbone_net={}".format(self.backbone), indent)
+ msg += textwrap.indent("\npool_net={}".format(self.pool), indent)
+ msg = header + '(' + msg + '\n)'
+ return msg
+
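+# Usage sketch (illustrative; the configuration below is an assumption, not taken from
+# the original source). A typical VisualCore is ResNet18Conv -> SpatialSoftmax ->
+# flatten -> Linear, mapping an RGB observation to a fixed-size feature vector:
+#
+#   vc = VisualCore(
+#       input_shape=(3, 84, 84),
+#       backbone_class="ResNet18Conv",
+#       pool_class="SpatialSoftmax",
+#       pool_kwargs={"num_kp": 32},
+#       feature_dimension=64,
+#   )
+#   vc(torch.zeros(2, 3, 84, 84)).shape   # -> torch.Size([2, 64])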
+
+"""
+================================================
+Scan Core Networks (Conv1D Sequential + Pool)
+================================================
+"""
+class ScanCore(EncoderCore, BaseNets.ConvBase):
+ """
+ A network block that combines a Conv1D backbone network with optional pooling
+ and linear layers.
+ """
+ def __init__(
+ self,
+ input_shape,
+ conv_kwargs=None,
+ conv_activation="relu",
+ pool_class=None,
+ pool_kwargs=None,
+ flatten=True,
+ feature_dimension=None,
+ ):
+ """
+ Args:
+ input_shape (tuple): shape of input (not including batch dimension)
+ conv_kwargs (dict): kwargs for the conv1d backbone network. Should contain lists for the following values:
+ out_channels (int)
+ kernel_size (int)
+ stride (int)
+ ...
+
+ If not specified, or an empty dictionary is specified, some default settings will be used.
+ conv_activation (str or None): Activation to use between conv layers. Default is relu.
+ Currently, valid options are {relu}
+ pool_class (str): class name for the visual feature pooler (optional)
+ Common options are "SpatialSoftmax" and "SpatialMeanPool"
+ pool_kwargs (dict): kwargs for the visual feature pooler (optional)
+ flatten (bool): whether to flatten the network output
+ feature_dimension (int): if not None, add a Linear layer to
+ project output into a desired feature dimension (note: flatten must be set to True!)
+ """
+ super(ScanCore, self).__init__(input_shape=input_shape)
+ self.flatten = flatten
+ self.feature_dimension = feature_dimension
+
+ if conv_kwargs is None:
+ conv_kwargs = dict()
+
+ # Generate backbone network
+ # N input channels is assumed to be the first dimension
+ self.backbone = BaseNets.Conv1dBase(
+ input_channel=self.input_shape[0],
+ activation=conv_activation,
+ **conv_kwargs,
+ )
+ feat_shape = self.backbone.output_shape(input_shape=input_shape)
+
+ # Create netlist of all generated networks
+ net_list = [self.backbone]
+
+ # Possibly add pooling network
+ if pool_class is not None:
+ # Add an unsqueeze network so that the shape is correct to pass to pooling network
+ self.unsqueeze = Unsqueeze(dim=-1)
+ net_list.append(self.unsqueeze)
+ # Get output shape
+ feat_shape = self.unsqueeze.output_shape(feat_shape)
+ # Create pooling network
+ self.pool = eval(pool_class)(input_shape=feat_shape, **pool_kwargs)
+ net_list.append(self.pool)
+ feat_shape = self.pool.output_shape(feat_shape)
+ else:
+ self.unsqueeze, self.pool = None, None
+
+ # flatten layer
+ if self.flatten:
+ net_list.append(torch.nn.Flatten(start_dim=1, end_dim=-1))
+
+ # maybe linear layer
+ if self.feature_dimension is not None:
+ assert self.flatten
+ linear = torch.nn.Linear(int(np.prod(feat_shape)), self.feature_dimension)
+ net_list.append(linear)
+
+ # Generate final network
+ self.nets = nn.Sequential(*net_list)
+
+ def output_shape(self, input_shape):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ if self.feature_dimension is not None:
+ # linear output
+ return [self.feature_dimension]
+ feat_shape = self.backbone.output_shape(input_shape)
+ if self.pool is not None:
+ # pool output
+ feat_shape = self.pool.output_shape(self.unsqueeze.output_shape(feat_shape))
+ # backbone + flat output
+ return [np.prod(feat_shape)] if self.flatten else feat_shape
+
+ def forward(self, inputs):
+ """
+        Forward pass through the scan core.
+ """
+ ndim = len(self.input_shape)
+ assert tuple(inputs.shape)[-ndim:] == tuple(self.input_shape)
+ return super(ScanCore, self).forward(inputs)
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ indent = ' ' * 2
+ msg += textwrap.indent(
+ "\ninput_shape={}\noutput_shape={}".format(self.input_shape, self.output_shape(self.input_shape)), indent)
+ msg += textwrap.indent("\nbackbone_net={}".format(self.backbone), indent)
+ msg += textwrap.indent("\npool_net={}".format(self.pool), indent)
+ msg = header + '(' + msg + '\n)'
+ return msg
+
+
+"""
+================================================
+Observation Randomizer Networks
+================================================
+"""
+class Randomizer(BaseNets.Module):
+ """
+ Base class for randomizer networks. Each randomizer should implement the @output_shape_in,
+ @output_shape_out, @forward_in, and @forward_out methods. The randomizer's @forward_in
+ method is invoked on raw inputs, and @forward_out is invoked on processed inputs
+ (usually processed by a @VisualCore instance). Note that the self.training property
+ can be used to change the randomizer's behavior at train vs. test time.
+ """
+ def __init__(self):
+ super(Randomizer, self).__init__()
+
+ def __init_subclass__(cls, **kwargs):
+ """
+ Hook method to automatically register all valid subclasses so we can keep track of valid observation randomizers
+ in a global dict.
+
+ This global dict stores mapping from observation randomizer network name to class.
+ We keep track of these registries to enable automated class inference at runtime, allowing
+ users to simply extend our base randomizer class and refer to that class in string form
+ in their config, without having to manually register their class internally.
+ This also future-proofs us for any additional randomizer classes we would
+ like to add ourselves.
+ """
+ ObsUtils.register_randomizer(cls)
+
+ def output_shape(self, input_shape=None):
+ """
+ This function is unused. See @output_shape_in and @output_shape_out.
+ """
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def output_shape_in(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module. Corresponds to
+ the @forward_in operation, where raw inputs (usually observation modalities)
+ are passed in.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def output_shape_out(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module. Corresponds to
+ the @forward_out operation, where processed inputs (usually encoded observation
+ modalities) are passed in.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ raise NotImplementedError
+
+ def forward_in(self, inputs):
+ """
+ Randomize raw inputs if training.
+ """
+ if self.training:
+ randomized_inputs = self._forward_in(inputs=inputs)
+ if VISUALIZE_RANDOMIZER:
+ num_samples_to_visualize = min(4, inputs.shape[0])
+ self._visualize(inputs, randomized_inputs, num_samples_to_visualize=num_samples_to_visualize)
+ return randomized_inputs
+ else:
+ return self._forward_in_eval(inputs)
+
+ def forward_out(self, inputs):
+ """
+ Processing for network outputs.
+ """
+ if self.training:
+ return self._forward_out(inputs)
+ else:
+ return self._forward_out_eval(inputs)
+
+ @abc.abstractmethod
+ def _forward_in(self, inputs):
+ """
+ Randomize raw inputs.
+ """
+ raise NotImplementedError
+
+ def _forward_in_eval(self, inputs):
+ """
+ Test-time behavior for the randomizer
+ """
+ return inputs
+
+ @abc.abstractmethod
+ def _forward_out(self, inputs):
+ """
+ Processing for network outputs.
+ """
+ return inputs
+
+ def _forward_out_eval(self, inputs):
+ """
+ Test-time behavior for the randomizer
+ """
+ return inputs
+
+ @abc.abstractmethod
+ def _visualize(self, pre_random_input, randomized_input, num_samples_to_visualize=2):
+ """
+ Visualize the original input and the randomized input for _forward_in for debugging purposes.
+ """
+ pass
+
+
+class CropRandomizer(Randomizer):
+ """
+ Randomly sample crops at input, and then average across crop features at output.
+ """
+ def __init__(
+ self,
+ input_shape,
+ crop_height=76,
+ crop_width=76,
+ num_crops=1,
+ pos_enc=False,
+ ):
+ """
+ Args:
+ input_shape (tuple, list): shape of input (not including batch dimension)
+ crop_height (int): crop height
+ crop_width (int): crop width
+ num_crops (int): number of random crops to take
+ pos_enc (bool): if True, add 2 channels to the output to encode the spatial
+ location of the cropped pixels in the source image
+ """
+ super(CropRandomizer, self).__init__()
+
+ assert len(input_shape) == 3 # (C, H, W)
+ assert crop_height < input_shape[1]
+ assert crop_width < input_shape[2]
+
+ self.input_shape = input_shape
+ self.crop_height = crop_height
+ self.crop_width = crop_width
+ self.num_crops = num_crops
+ self.pos_enc = pos_enc
+
+ def output_shape_in(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module. Corresponds to
+ the @forward_in operation, where raw inputs (usually observation modalities)
+ are passed in.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+
+ # outputs are shape (C, CH, CW), or maybe C + 2 if using position encoding, because
+ # the number of crops are reshaped into the batch dimension, increasing the batch
+ # size from B to B * N
+ out_c = self.input_shape[0] + 2 if self.pos_enc else self.input_shape[0]
+ return [out_c, self.crop_height, self.crop_width]
+
+ def output_shape_out(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module. Corresponds to
+ the @forward_out operation, where processed inputs (usually encoded observation
+ modalities) are passed in.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+
+ # since the forward_out operation splits [B * N, ...] -> [B, N, ...]
+ # and then pools to result in [B, ...], only the batch dimension changes,
+ # and so the other dimensions retain their shape.
+ return list(input_shape)
+
+ def _forward_in(self, inputs):
+ """
+ Samples N random crops for each input in the batch, and then reshapes
+ inputs to [B * N, ...].
+ """
+ assert len(inputs.shape) >= 3 # must have at least (C, H, W) dimensions
+ out, _ = ObsUtils.sample_random_image_crops(
+ images=inputs,
+ crop_height=self.crop_height,
+ crop_width=self.crop_width,
+ num_crops=self.num_crops,
+ pos_enc=self.pos_enc,
+ )
+ # [B, N, ...] -> [B * N, ...]
+ return TensorUtils.join_dimensions(out, 0, 1)
+
+ def _forward_in_eval(self, inputs):
+ """
+ Do center crops during eval
+ """
+ assert len(inputs.shape) >= 3 # must have at least (C, H, W) dimensions
+ inputs = inputs.permute(*range(inputs.dim()-3), inputs.dim()-2, inputs.dim()-1, inputs.dim()-3)
+ out = ObsUtils.center_crop(inputs, self.crop_height, self.crop_width)
+ out = out.permute(*range(out.dim()-3), out.dim()-1, out.dim()-3, out.dim()-2)
+ return out
+
+ def _forward_out(self, inputs):
+ """
+ Splits the outputs from shape [B * N, ...] -> [B, N, ...] and then average across N
+ to result in shape [B, ...] to make sure the network output is consistent with
+ what would have happened if there were no randomization.
+ """
+ batch_size = (inputs.shape[0] // self.num_crops)
+ out = TensorUtils.reshape_dimensions(inputs, begin_axis=0, end_axis=0,
+ target_dims=(batch_size, self.num_crops))
+ return out.mean(dim=1)
+
+ def _visualize(self, pre_random_input, randomized_input, num_samples_to_visualize=2):
+ batch_size = pre_random_input.shape[0]
+ random_sample_inds = torch.randint(0, batch_size, size=(num_samples_to_visualize,))
+ pre_random_input_np = TensorUtils.to_numpy(pre_random_input)[random_sample_inds]
+ randomized_input = TensorUtils.reshape_dimensions(
+ randomized_input,
+ begin_axis=0,
+ end_axis=0,
+ target_dims=(batch_size, self.num_crops)
+ ) # [B * N, ...] -> [B, N, ...]
+ randomized_input_np = TensorUtils.to_numpy(randomized_input[random_sample_inds])
+
+ pre_random_input_np = pre_random_input_np.transpose((0, 2, 3, 1)) # [B, C, H, W] -> [B, H, W, C]
+ randomized_input_np = randomized_input_np.transpose((0, 1, 3, 4, 2)) # [B, N, C, H, W] -> [B, N, H, W, C]
+
+ visualize_image_randomizer(
+ pre_random_input_np,
+ randomized_input_np,
+ randomizer_name='{}'.format(str(self.__class__.__name__))
+ )
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = header + "(input_shape={}, crop_size=[{}, {}], num_crops={})".format(
+ self.input_shape, self.crop_height, self.crop_width, self.num_crops)
+ return msg
+
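+# Usage sketch (illustrative; shapes are assumptions, not taken from the original
+# source). The randomizer multiplies the batch dimension on the way in and averages
+# it back out, so the downstream encoder never sees the number of crops:
+#
+#   rnd = CropRandomizer(input_shape=[3, 84, 84], crop_height=76, crop_width=76, num_crops=2)
+#   x = rnd.forward_in(torch.zeros(8, 3, 84, 84))   # -> [16, 3, 76, 76] in train mode
+#   feats = x.flatten(start_dim=1)                  # stand-in for an encoder: [16, D]
+#   rnd.forward_out(feats).shape                    # -> torch.Size([8, 17328])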
+
+class ColorRandomizer(Randomizer):
+ """
+    Randomly sample color jitter at input, and then average across color jitters at output.
+ """
+ def __init__(
+ self,
+ input_shape,
+ brightness=0.3,
+ contrast=0.3,
+ saturation=0.3,
+ hue=0.3,
+ num_samples=1,
+ ):
+ """
+ Args:
+ input_shape (tuple, list): shape of input (not including batch dimension)
+ brightness (None or float or 2-tuple): How much to jitter brightness. brightness_factor is chosen uniformly
+ from [max(0, 1 - brightness), 1 + brightness] or the given [min, max]. Should be non negative numbers.
+ contrast (None or float or 2-tuple): How much to jitter contrast. contrast_factor is chosen uniformly
+ from [max(0, 1 - contrast), 1 + contrast] or the given [min, max]. Should be non negative numbers.
+ saturation (None or float or 2-tuple): How much to jitter saturation. saturation_factor is chosen uniformly
+ from [max(0, 1 - saturation), 1 + saturation] or the given [min, max]. Should be non negative numbers.
+ hue (None or float or 2-tuple): How much to jitter hue. hue_factor is chosen uniformly from [-hue, hue] or
+ the given [min, max]. Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. To jitter hue, the pixel
+            values of the input image have to be non-negative for conversion to HSV space; thus it does not work
+ if you normalize your image to an interval with negative values, or use an interpolation that
+ generates negative values before using this function.
+ num_samples (int): number of random color jitters to take
+ """
+ super(ColorRandomizer, self).__init__()
+
+ assert len(input_shape) == 3 # (C, H, W)
+
+ self.input_shape = input_shape
+ self.brightness = [max(0, 1 - brightness), 1 + brightness] if type(brightness) in {float, int} else brightness
+ self.contrast = [max(0, 1 - contrast), 1 + contrast] if type(contrast) in {float, int} else contrast
+ self.saturation = [max(0, 1 - saturation), 1 + saturation] if type(saturation) in {float, int} else saturation
+ self.hue = [-hue, hue] if type(hue) in {float, int} else hue
+ self.num_samples = num_samples
+
+ @torch.jit.unused
+ def get_transform(self):
+ """
+ Get a randomized transform to be applied on image.
+
+ Implementation taken directly from:
+
+ https://github.com/pytorch/vision/blob/2f40a483d73018ae6e1488a484c5927f2b309969/torchvision/transforms/transforms.py#L1053-L1085
+
+ Returns:
+ Transform: Transform which randomly adjusts brightness, contrast and
+ saturation in a random order.
+ """
+ transforms = []
+
+ if self.brightness is not None:
+ brightness_factor = random.uniform(self.brightness[0], self.brightness[1])
+ transforms.append(Lambda(lambda img: TVF.adjust_brightness(img, brightness_factor)))
+
+ if self.contrast is not None:
+ contrast_factor = random.uniform(self.contrast[0], self.contrast[1])
+ transforms.append(Lambda(lambda img: TVF.adjust_contrast(img, contrast_factor)))
+
+ if self.saturation is not None:
+ saturation_factor = random.uniform(self.saturation[0], self.saturation[1])
+ transforms.append(Lambda(lambda img: TVF.adjust_saturation(img, saturation_factor)))
+
+ if self.hue is not None:
+ hue_factor = random.uniform(self.hue[0], self.hue[1])
+ transforms.append(Lambda(lambda img: TVF.adjust_hue(img, hue_factor)))
+
+ random.shuffle(transforms)
+ transform = Compose(transforms)
+
+ return transform
+
+ def get_batch_transform(self, N):
+ """
+ Generates a batch transform, where each set of sample(s) along the batch (first) dimension will have the same
+ @N unique ColorJitter transforms applied.
+
+ Args:
+ N (int): Number of ColorJitter transforms to apply per set of sample(s) along the batch (first) dimension
+
+ Returns:
+            Lambda: Aggregated transform which will automatically apply a different ColorJitter transform to
+                each sub-set of samples along the batch dimension, assumed to be the FIRST dimension in the input tensor.
+ Note: This function will MULTIPLY the first dimension by N
+ """
+ return Lambda(lambda x: torch.stack([self.get_transform()(x_) for x_ in x for _ in range(N)]))
+
+ def output_shape_in(self, input_shape=None):
+ # outputs are same shape as inputs
+ return list(input_shape)
+
+ def output_shape_out(self, input_shape=None):
+ # since the forward_out operation splits [B * N, ...] -> [B, N, ...]
+ # and then pools to result in [B, ...], only the batch dimension changes,
+ # and so the other dimensions retain their shape.
+ return list(input_shape)
+
+ def _forward_in(self, inputs):
+ """
+ Samples N random color jitters for each input in the batch, and then reshapes
+ inputs to [B * N, ...].
+ """
+ assert len(inputs.shape) >= 3 # must have at least (C, H, W) dimensions
+
+ # Make sure shape is exactly 4
+ if len(inputs.shape) == 3:
+ inputs = torch.unsqueeze(inputs, dim=0)
+
+ # Create lambda to aggregate all color randomizings at once
+ transform = self.get_batch_transform(N=self.num_samples)
+
+ return transform(inputs)
+
+ def _forward_out(self, inputs):
+ """
+ Splits the outputs from shape [B * N, ...] -> [B, N, ...] and then average across N
+ to result in shape [B, ...] to make sure the network output is consistent with
+ what would have happened if there were no randomization.
+ """
+ batch_size = (inputs.shape[0] // self.num_samples)
+ out = TensorUtils.reshape_dimensions(inputs, begin_axis=0, end_axis=0,
+ target_dims=(batch_size, self.num_samples))
+ return out.mean(dim=1)
+
+ def _visualize(self, pre_random_input, randomized_input, num_samples_to_visualize=2):
+ batch_size = pre_random_input.shape[0]
+ random_sample_inds = torch.randint(0, batch_size, size=(num_samples_to_visualize,))
+ pre_random_input_np = TensorUtils.to_numpy(pre_random_input)[random_sample_inds]
+ randomized_input = TensorUtils.reshape_dimensions(
+ randomized_input,
+ begin_axis=0,
+ end_axis=0,
+ target_dims=(batch_size, self.num_samples)
+ ) # [B * N, ...] -> [B, N, ...]
+ randomized_input_np = TensorUtils.to_numpy(randomized_input[random_sample_inds])
+
+ pre_random_input_np = pre_random_input_np.transpose((0, 2, 3, 1)) # [B, C, H, W] -> [B, H, W, C]
+ randomized_input_np = randomized_input_np.transpose((0, 1, 3, 4, 2)) # [B, N, C, H, W] -> [B, N, H, W, C]
+
+ visualize_image_randomizer(
+ pre_random_input_np,
+ randomized_input_np,
+ randomizer_name='{}'.format(str(self.__class__.__name__))
+ )
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = header + f"(input_shape={self.input_shape}, brightness={self.brightness}, contrast={self.contrast}, " \
+ f"saturation={self.saturation}, hue={self.hue}, num_samples={self.num_samples})"
+ return msg
+
+
+class GaussianNoiseRandomizer(Randomizer):
+ """
+ Randomly sample gaussian noise at input, and then average across noises at output.
+ """
+ def __init__(
+ self,
+ input_shape,
+ noise_mean=0.0,
+ noise_std=0.3,
+ limits=None,
+ num_samples=1,
+ ):
+ """
+ Args:
+ input_shape (tuple, list): shape of input (not including batch dimension)
+ noise_mean (float): Mean of noise to apply
+ noise_std (float): Standard deviation of noise to apply
+ limits (None or 2-tuple): If specified, should be the (min, max) values to clamp all noisied samples to
+            num_samples (int): number of random noise samples to take
+ """
+ super(GaussianNoiseRandomizer, self).__init__()
+
+ self.input_shape = input_shape
+ self.noise_mean = noise_mean
+ self.noise_std = noise_std
+ self.limits = limits
+ self.num_samples = num_samples
+
+ def output_shape_in(self, input_shape=None):
+ # outputs are same shape as inputs
+ return list(input_shape)
+
+ def output_shape_out(self, input_shape=None):
+ # since the forward_out operation splits [B * N, ...] -> [B, N, ...]
+ # and then pools to result in [B, ...], only the batch dimension changes,
+ # and so the other dimensions retain their shape.
+ return list(input_shape)
+
+ def _forward_in(self, inputs):
+ """
+ Samples N random gaussian noises for each input in the batch, and then reshapes
+ inputs to [B * N, ...].
+ """
+ out = TensorUtils.repeat_by_expand_at(inputs, repeats=self.num_samples, dim=0)
+
+ # Sample noise across all samples
+        # draw Gaussian noise (rather than uniform), matching the class name and docstring
+        out = torch.randn(size=out.shape).to(inputs.device) * self.noise_std + self.noise_mean + out
+
+ # Possibly clamp
+ if self.limits is not None:
+ out = torch.clip(out, min=self.limits[0], max=self.limits[1])
+
+ return out
+
+ def _forward_out(self, inputs):
+ """
+ Splits the outputs from shape [B * N, ...] -> [B, N, ...] and then average across N
+ to result in shape [B, ...] to make sure the network output is consistent with
+ what would have happened if there were no randomization.
+ """
+ batch_size = (inputs.shape[0] // self.num_samples)
+ out = TensorUtils.reshape_dimensions(inputs, begin_axis=0, end_axis=0,
+ target_dims=(batch_size, self.num_samples))
+ return out.mean(dim=1)
+
+ def _visualize(self, pre_random_input, randomized_input, num_samples_to_visualize=2):
+ batch_size = pre_random_input.shape[0]
+ random_sample_inds = torch.randint(0, batch_size, size=(num_samples_to_visualize,))
+ pre_random_input_np = TensorUtils.to_numpy(pre_random_input)[random_sample_inds]
+ randomized_input = TensorUtils.reshape_dimensions(
+ randomized_input,
+ begin_axis=0,
+ end_axis=0,
+ target_dims=(batch_size, self.num_samples)
+ ) # [B * N, ...] -> [B, N, ...]
+ randomized_input_np = TensorUtils.to_numpy(randomized_input[random_sample_inds])
+
+ pre_random_input_np = pre_random_input_np.transpose((0, 2, 3, 1)) # [B, C, H, W] -> [B, H, W, C]
+ randomized_input_np = randomized_input_np.transpose((0, 1, 3, 4, 2)) # [B, N, C, H, W] -> [B, N, H, W, C]
+
+ visualize_image_randomizer(
+ pre_random_input_np,
+ randomized_input_np,
+ randomizer_name='{}'.format(str(self.__class__.__name__))
+ )
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = header + f"(input_shape={self.input_shape}, noise_mean={self.noise_mean}, noise_std={self.noise_std}, " \
+ f"limits={self.limits}, num_samples={self.num_samples})"
+ return msg
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/obs_nets.py b/phantom/submodules/phantom-robomimic/robomimic/models/obs_nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..b328418505d4aedefcf43b0c3cbd6dd87ae05c37
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/obs_nets.py
@@ -0,0 +1,1099 @@
+"""
+Contains torch Modules that help deal with inputs consisting of multiple
+modalities. This is extremely common when networks must deal with one or
+more observation dictionaries, where each input dictionary can have
+observation keys of a certain modality and shape.
+
+As an example, an observation could consist of a flat "robot0_eef_pos" observation key,
+and a 3-channel RGB "agentview_image" observation key.
+"""
+import sys
+import numpy as np
+import textwrap
+from copy import deepcopy
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributions as D
+
+from robomimic.utils.python_utils import extract_class_init_kwargs_from_dict
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.obs_utils as ObsUtils
+from robomimic.models.base_nets import Module, Sequential, MLP, RNN_Base, ResNet18Conv, SpatialSoftmax, \
+ FeatureAggregator
+from robomimic.models.obs_core import VisualCore, Randomizer
+from robomimic.models.transformers import PositionalEncoding, GPT_Backbone
+
+
+def obs_encoder_factory(
+ obs_shapes,
+ feature_activation=nn.ReLU,
+ encoder_kwargs=None,
+ ):
+ """
+ Utility function to create an @ObservationEncoder from kwargs specified in config.
+
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps observation key to
+ expected shapes for observations.
+
+ feature_activation: non-linearity to apply after each obs net - defaults to ReLU. Pass
+ None to apply no activation.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should be
+ nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ enc = ObservationEncoder(feature_activation=feature_activation)
+ for k, obs_shape in obs_shapes.items():
+ obs_modality = ObsUtils.OBS_KEYS_TO_MODALITIES[k]
+ enc_kwargs = deepcopy(ObsUtils.DEFAULT_ENCODER_KWARGS[obs_modality]) if encoder_kwargs is None else \
+ deepcopy(encoder_kwargs[obs_modality])
+
+ for obs_module, cls_mapping in zip(("core", "obs_randomizer"),
+ (ObsUtils.OBS_ENCODER_CORES, ObsUtils.OBS_RANDOMIZERS)):
+ # Sanity check for kwargs in case they don't exist / are None
+ if enc_kwargs.get(f"{obs_module}_kwargs", None) is None:
+ enc_kwargs[f"{obs_module}_kwargs"] = {}
+ # Add in input shape info
+ enc_kwargs[f"{obs_module}_kwargs"]["input_shape"] = obs_shape
+ # If group class is specified, then make sure corresponding kwargs only contain relevant kwargs
+ if enc_kwargs[f"{obs_module}_class"] is not None:
+ enc_kwargs[f"{obs_module}_kwargs"] = extract_class_init_kwargs_from_dict(
+ cls=cls_mapping[enc_kwargs[f"{obs_module}_class"]],
+ dic=enc_kwargs[f"{obs_module}_kwargs"],
+ copy=False,
+ )
+
+        # instantiate the observation randomizer for this modality, if one is specified
+ randomizer = None if enc_kwargs["obs_randomizer_class"] is None else \
+ ObsUtils.OBS_RANDOMIZERS[enc_kwargs["obs_randomizer_class"]](**enc_kwargs["obs_randomizer_kwargs"])
+
+ enc.register_obs_key(
+ name=k,
+ shape=obs_shape,
+ net_class=enc_kwargs["core_class"],
+ net_kwargs=enc_kwargs["core_kwargs"],
+ randomizer=randomizer,
+ )
+
+ enc.make()
+ return enc
+
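+# Configuration sketch (illustrative; the keys and values below are assumptions, not
+# taken from the original source). A minimal encoder_kwargs dict in the nested form
+# expected by obs_encoder_factory above:
+#
+#   encoder_kwargs = {
+#       "rgb": {
+#           "core_class": "VisualCore",
+#           "core_kwargs": {"feature_dimension": 64, "backbone_class": "ResNet18Conv",
+#                           "pool_class": "SpatialSoftmax"},
+#           "obs_randomizer_class": "CropRandomizer",
+#           "obs_randomizer_kwargs": {"crop_height": 76, "crop_width": 76},
+#       },
+#       "low_dim": {
+#           "core_class": None, "core_kwargs": {},
+#           "obs_randomizer_class": None, "obs_randomizer_kwargs": {},
+#       },
+#   }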
+
+class ObservationEncoder(Module):
+ """
+ Module that processes inputs by observation key and then concatenates the processed
+ observation keys together. Each key is processed with an encoder head network.
+ Call @register_obs_key to register observation keys with the encoder and then
+ finally call @make to create the encoder networks.
+ """
+ def __init__(self, feature_activation=nn.ReLU):
+ """
+ Args:
+ feature_activation: non-linearity to apply after each obs net - defaults to ReLU. Pass
+ None to apply no activation.
+ """
+ super(ObservationEncoder, self).__init__()
+ self.obs_shapes = OrderedDict()
+ self.obs_nets_classes = OrderedDict()
+ self.obs_nets_kwargs = OrderedDict()
+ self.obs_share_mods = OrderedDict()
+ self.obs_nets = nn.ModuleDict()
+ self.obs_randomizers = nn.ModuleDict()
+ self.feature_activation = feature_activation
+ self._locked = False
+
+ def register_obs_key(
+ self,
+ name,
+ shape,
+ net_class=None,
+ net_kwargs=None,
+ net=None,
+ randomizer=None,
+ share_net_from=None,
+ ):
+ """
+ Register an observation key that this encoder should be responsible for.
+
+ Args:
+ name (str): modality name
+ shape (int tuple): shape of modality
+ net_class (str): name of class in base_nets.py that should be used
+ to process this observation key before concatenation. Pass None to flatten
+ and concatenate the observation key directly.
+ net_kwargs (dict): arguments to pass to @net_class
+ net (Module instance): if provided, use this Module to process the observation key
+ instead of creating a different net
+ randomizer (Randomizer instance): if provided, use this Module to augment observation keys
+ coming in to the encoder, and possibly augment the processed output as well
+ share_net_from (str): if provided, use the same instance of @net_class
+ as another observation key. This observation key must already exist in this encoder.
+ Warning: Note that this does not share the observation key randomizer
+ """
+ assert not self._locked, "ObservationEncoder: @register_obs_key called after @make"
+ assert name not in self.obs_shapes, "ObservationEncoder: modality {} already exists".format(name)
+
+ if net is not None:
+ assert isinstance(net, Module), "ObservationEncoder: @net must be instance of Module class"
+ assert (net_class is None) and (net_kwargs is None) and (share_net_from is None), \
+ "ObservationEncoder: @net provided - ignore other net creation options"
+
+ if share_net_from is not None:
+ # share processing with another modality
+ assert (net_class is None) and (net_kwargs is None)
+ assert share_net_from in self.obs_shapes
+
+ net_kwargs = deepcopy(net_kwargs) if net_kwargs is not None else {}
+ if randomizer is not None:
+ assert isinstance(randomizer, Randomizer)
+ if net_kwargs is not None:
+ # update input shape to visual core
+ net_kwargs["input_shape"] = randomizer.output_shape_in(shape)
+
+ self.obs_shapes[name] = shape
+ self.obs_nets_classes[name] = net_class
+ self.obs_nets_kwargs[name] = net_kwargs
+ self.obs_nets[name] = net
+ self.obs_randomizers[name] = randomizer
+ self.obs_share_mods[name] = share_net_from
+
+ def make(self):
+ """
+ Creates the encoder networks and locks the encoder so that more modalities cannot be added.
+ """
+ assert not self._locked, "ObservationEncoder: @make called more than once"
+ self._create_layers()
+ self._locked = True
+
+ def _create_layers(self):
+ """
+ Creates all networks and layers required by this encoder using the registered modalities.
+ """
+ assert not self._locked, "ObservationEncoder: layers have already been created"
+
+ for k in self.obs_shapes:
+ if self.obs_nets_classes[k] is not None:
+ # create net to process this modality
+ self.obs_nets[k] = ObsUtils.OBS_ENCODER_CORES[self.obs_nets_classes[k]](**self.obs_nets_kwargs[k])
+ elif self.obs_share_mods[k] is not None:
+ # make sure net is shared with another modality
+ self.obs_nets[k] = self.obs_nets[self.obs_share_mods[k]]
+
+ self.activation = None
+ if self.feature_activation is not None:
+ self.activation = self.feature_activation()
+
+ def forward(self, obs_dict):
+ """
+ Processes modalities according to the ordering in @self.obs_shapes. For each
+ modality, it is processed with a randomizer (if present), an encoder
+ network (if present), and again with the randomizer (if present), flattened,
+ and then concatenated with the other processed modalities.
+
+ Args:
+ obs_dict (OrderedDict): dictionary that maps modalities to torch.Tensor
+ batches that agree with @self.obs_shapes. All modalities in
+ @self.obs_shapes must be present, but additional modalities
+ can also be present.
+
+ Returns:
+ feats (torch.Tensor): flat features of shape [B, D]
+ """
+ assert self._locked, "ObservationEncoder: @make has not been called yet"
+
+ # ensure all modalities that the encoder handles are present
+ assert set(self.obs_shapes.keys()).issubset(obs_dict), "ObservationEncoder: {} does not contain all modalities {}".format(
+ list(obs_dict.keys()), list(self.obs_shapes.keys())
+ )
+
+ # process modalities by order given by @self.obs_shapes
+ feats = []
+ for k in self.obs_shapes:
+ x = obs_dict[k]
+ # maybe process encoder input with randomizer
+ if self.obs_randomizers[k] is not None:
+ x = self.obs_randomizers[k].forward_in(x)
+ # maybe process with obs net
+ if self.obs_nets[k] is not None:
+ x = self.obs_nets[k](x)
+ if self.activation is not None:
+ x = self.activation(x)
+ # maybe process encoder output with randomizer
+ if self.obs_randomizers[k] is not None:
+ x = self.obs_randomizers[k].forward_out(x)
+ # flatten to [B, D]
+ x = TensorUtils.flatten(x, begin_axis=1)
+ feats.append(x)
+
+ # concatenate all features together
+ return torch.cat(feats, dim=-1)
+
+ def output_shape(self, input_shape=None):
+ """
+ Compute the output shape of the encoder.
+ """
+ feat_dim = 0
+ for k in self.obs_shapes:
+ feat_shape = self.obs_shapes[k]
+ if self.obs_randomizers[k] is not None:
+ feat_shape = self.obs_randomizers[k].output_shape_in(feat_shape)
+ if self.obs_nets[k] is not None:
+ feat_shape = self.obs_nets[k].output_shape(feat_shape)
+ if self.obs_randomizers[k] is not None:
+ feat_shape = self.obs_randomizers[k].output_shape_out(feat_shape)
+ feat_dim += int(np.prod(feat_shape))
+ return [feat_dim]
+
+ def __repr__(self):
+ """
+ Pretty print the encoder.
+ """
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ for k in self.obs_shapes:
+ msg += textwrap.indent('\nKey(\n', ' ' * 4)
+ indent = ' ' * 8
+ msg += textwrap.indent("name={}\nshape={}\n".format(k, self.obs_shapes[k]), indent)
+ msg += textwrap.indent("modality={}\n".format(ObsUtils.OBS_KEYS_TO_MODALITIES[k]), indent)
+ msg += textwrap.indent("randomizer={}\n".format(self.obs_randomizers[k]), indent)
+ msg += textwrap.indent("net={}\n".format(self.obs_nets[k]), indent)
+ msg += textwrap.indent("sharing_from={}\n".format(self.obs_share_mods[k]), indent)
+ msg += textwrap.indent(")", ' ' * 4)
+ msg += textwrap.indent("\noutput_shape={}".format(self.output_shape()), ' ' * 4)
+ msg = header + '(' + msg + '\n)'
+ return msg
+
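+# Usage sketch (illustrative; the observation keys and shapes are assumptions, not
+# taken from the original source). The lifecycle is register_obs_key() per key,
+# then make(), then forward() on an observation dict:
+#
+#   enc = ObservationEncoder()
+#   enc.register_obs_key(name="robot0_eef_pos", shape=(3,))
+#   enc.register_obs_key(name="agentview_image", shape=(3, 84, 84),
+#                        net_class="VisualCore", net_kwargs={"input_shape": (3, 84, 84)})
+#   enc.make()
+#   # enc(obs_dict) then returns a flat [B, D] tensor, with D == enc.output_shape()[0]
+#   # (64 + 3 = 67 for the default VisualCore feature_dimension of 64)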
+
+class ObservationDecoder(Module):
+ """
+ Module that can generate observation outputs by modality. Inputs are assumed
+ to be flat (usually outputs from some hidden layer). Each observation output
+ is generated with a linear layer from these flat inputs. Subclass this
+ module in order to implement more complex schemes for generating each
+ modality.
+ """
+ def __init__(
+ self,
+ decode_shapes,
+ input_feat_dim,
+ ):
+ """
+ Args:
+ decode_shapes (OrderedDict): a dictionary that maps observation key to
+ expected shape. This is used to generate output modalities from the
+ input features.
+
+ input_feat_dim (int): flat input dimension size
+ """
+ super(ObservationDecoder, self).__init__()
+
+ # important: sort observation keys to ensure consistent ordering of modalities
+ assert isinstance(decode_shapes, OrderedDict)
+ self.obs_shapes = OrderedDict()
+ for k in decode_shapes:
+ self.obs_shapes[k] = decode_shapes[k]
+
+ self.input_feat_dim = input_feat_dim
+ self._create_layers()
+
+ def _create_layers(self):
+ """
+ Create a linear layer to predict each modality.
+ """
+ self.nets = nn.ModuleDict()
+ for k in self.obs_shapes:
+ layer_out_dim = int(np.prod(self.obs_shapes[k]))
+ self.nets[k] = nn.Linear(self.input_feat_dim, layer_out_dim)
+
+ def output_shape(self, input_shape=None):
+ """
+ Returns output shape for this module, which is a dictionary instead
+ of a list since outputs are dictionaries.
+ """
+ return { k : list(self.obs_shapes[k]) for k in self.obs_shapes }
+
+ def forward(self, feats):
+ """
+ Predict each modality from input features, and reshape to each modality's shape.
+ """
+ output = {}
+ for k in self.obs_shapes:
+ out = self.nets[k](feats)
+ output[k] = out.reshape(-1, *self.obs_shapes[k])
+ return output
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ for k in self.obs_shapes:
+ msg += textwrap.indent('\nKey(\n', ' ' * 4)
+ indent = ' ' * 8
+ msg += textwrap.indent("name={}\nshape={}\n".format(k, self.obs_shapes[k]), indent)
+ msg += textwrap.indent("modality={}\n".format(ObsUtils.OBS_KEYS_TO_MODALITIES[k]), indent)
+ msg += textwrap.indent("net=({})\n".format(self.nets[k]), indent)
+ msg += textwrap.indent(")", ' ' * 4)
+ msg = header + '(' + msg + '\n)'
+ return msg
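+
+
+# Usage sketch (illustrative comment only, not library code; shapes are assumptions):
+#
+#   dec = ObservationDecoder(
+#       decode_shapes=OrderedDict(action=(7,)),
+#       input_feat_dim=64,
+#   )
+#   out = dec(torch.zeros(4, 64))   # {"action": tensor of shape [4, 7]}
+#
+# Each output key gets its own nn.Linear head applied to the flat input and
+# reshaped to the requested shape; override _create_layers for richer decoders.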
+
+
+class ObservationGroupEncoder(Module):
+ """
+ This class allows networks to encode multiple observation dictionaries into a single
+ flat, concatenated vector representation. It does this by assigning each observation
+ dictionary (observation group) an @ObservationEncoder object.
+
+ The class takes a dictionary of dictionaries, @observation_group_shapes.
+    Each key corresponds to an observation group (e.g. 'obs', 'subgoal', 'goal')
+ and each OrderedDict should be a map between modalities and
+ expected input shapes (e.g. { 'image' : (3, 120, 160) }).
+ """
+ def __init__(
+ self,
+ observation_group_shapes,
+ feature_activation=nn.ReLU,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ observation_group_shapes (OrderedDict): a dictionary of dictionaries.
+ Each key in this dictionary should specify an observation group, and
+ the value should be an OrderedDict that maps modalities to
+ expected shapes.
+
+ feature_activation: non-linearity to apply after each obs net - defaults to ReLU. Pass
+ None to apply no activation.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ super(ObservationGroupEncoder, self).__init__()
+
+ # type checking
+ assert isinstance(observation_group_shapes, OrderedDict)
+ assert np.all([isinstance(observation_group_shapes[k], OrderedDict) for k in observation_group_shapes])
+
+ self.observation_group_shapes = observation_group_shapes
+
+ # create an observation encoder per observation group
+ self.nets = nn.ModuleDict()
+ for obs_group in self.observation_group_shapes:
+ self.nets[obs_group] = obs_encoder_factory(
+ obs_shapes=self.observation_group_shapes[obs_group],
+ feature_activation=feature_activation,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def forward(self, **inputs):
+ """
+ Process each set of inputs in its own observation group.
+
+ Args:
+ inputs (dict): dictionary that maps observation groups to observation
+ dictionaries of torch.Tensor batches that agree with
+ @self.observation_group_shapes. All observation groups in
+ @self.observation_group_shapes must be present, but additional
+ observation groups can also be present. Note that these are specified
+ as kwargs for ease of use with networks that name each observation
+ stream in their forward calls.
+
+ Returns:
+ outputs (torch.Tensor): flat outputs of shape [B, D]
+ """
+
+ # ensure all observation groups we need are present
+ assert set(self.observation_group_shapes.keys()).issubset(inputs), "{} does not contain all observation groups {}".format(
+ list(inputs.keys()), list(self.observation_group_shapes.keys())
+ )
+
+ outputs = []
+ # Deterministic order since self.observation_group_shapes is OrderedDict
+ for obs_group in self.observation_group_shapes:
+ # pass through encoder
+ outputs.append(
+ self.nets[obs_group].forward(inputs[obs_group])
+ )
+
+ return torch.cat(outputs, dim=-1)
+
+ def output_shape(self):
+ """
+ Compute the output shape of this encoder.
+ """
+ feat_dim = 0
+ for obs_group in self.observation_group_shapes:
+ # get feature dimension of these keys
+ feat_dim += self.nets[obs_group].output_shape()[0]
+ return [feat_dim]
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ for k in self.observation_group_shapes:
+ msg += '\n'
+ indent = ' ' * 4
+ msg += textwrap.indent("group={}\n{}".format(k, self.nets[k]), indent)
+ msg = header + '(' + msg + '\n)'
+ return msg
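+
+
+# Usage sketch (illustrative comment only, not library code; group/key names, shapes,
+# and prior ObsUtils modality registration are assumptions):
+#
+#   group_enc = ObservationGroupEncoder(
+#       observation_group_shapes=OrderedDict(
+#           obs=OrderedDict(robot0_eef_pos=(3,)),
+#           goal=OrderedDict(robot0_eef_pos=(3,)),
+#       ),
+#   )
+#   flat = group_enc(
+#       obs=dict(robot0_eef_pos=torch.zeros(4, 3)),
+#       goal=dict(robot0_eef_pos=torch.zeros(4, 3)),
+#   )   # -> [4, 6]: one ObservationEncoder per group, outputs concatenated
+#
+# Note the kwargs-style forward: each observation group is passed by name.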
+
+
+class MIMO_MLP(Module):
+ """
+ Extension to MLP to accept multiple observation dictionaries as input and
+ to output dictionaries of tensors. Inputs are specified as a dictionary of
+ observation dictionaries, with each key corresponding to an observation group.
+
+ This module utilizes @ObservationGroupEncoder to process the multiple input dictionaries and
+ @ObservationDecoder to generate tensor dictionaries. The default behavior
+    for encoding the inputs is to process visual inputs with a learned CNN and concatenate
+    the flat encodings with the other flat inputs. The default behavior for generating
+ outputs is to use a linear layer branch to produce each modality separately
+ (including visual outputs).
+ """
+ def __init__(
+ self,
+ input_obs_group_shapes,
+ output_shapes,
+ layer_dims,
+ layer_func=nn.Linear,
+ activation=nn.ReLU,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ input_obs_group_shapes (OrderedDict): a dictionary of dictionaries.
+ Each key in this dictionary should specify an observation group, and
+ the value should be an OrderedDict that maps modalities to
+ expected shapes.
+
+ output_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for outputs.
+
+ layer_dims ([int]): sequence of integers for the MLP hidden layer sizes
+
+ layer_func: mapping per MLP layer - defaults to Linear
+
+ activation: non-linearity per MLP layer - defaults to ReLU
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ super(MIMO_MLP, self).__init__()
+
+ assert isinstance(input_obs_group_shapes, OrderedDict)
+ assert np.all([isinstance(input_obs_group_shapes[k], OrderedDict) for k in input_obs_group_shapes])
+ assert isinstance(output_shapes, OrderedDict)
+
+ self.input_obs_group_shapes = input_obs_group_shapes
+ self.output_shapes = output_shapes
+
+ self.nets = nn.ModuleDict()
+
+ # Encoder for all observation groups.
+ self.nets["encoder"] = ObservationGroupEncoder(
+ observation_group_shapes=input_obs_group_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ # flat encoder output dimension
+ mlp_input_dim = self.nets["encoder"].output_shape()[0]
+
+ # intermediate MLP layers
+ self.nets["mlp"] = MLP(
+ input_dim=mlp_input_dim,
+ output_dim=layer_dims[-1],
+ layer_dims=layer_dims[:-1],
+ layer_func=layer_func,
+ activation=activation,
+ output_activation=activation, # make sure non-linearity is applied before decoder
+ )
+
+ # decoder for output modalities
+ self.nets["decoder"] = ObservationDecoder(
+ decode_shapes=self.output_shapes,
+ input_feat_dim=layer_dims[-1],
+ )
+
+ def output_shape(self, input_shape=None):
+ """
+ Returns output shape for this module, which is a dictionary instead
+ of a list since outputs are dictionaries.
+ """
+ return { k : list(self.output_shapes[k]) for k in self.output_shapes }
+
+ def forward(self, **inputs):
+ """
+ Process each set of inputs in its own observation group.
+
+ Args:
+ inputs (dict): a dictionary of dictionaries with one dictionary per
+ observation group. Each observation group's dictionary should map
+ modality to torch.Tensor batches. Should be consistent with
+ @self.input_obs_group_shapes.
+
+ Returns:
+ outputs (dict): dictionary of output torch.Tensors, that corresponds
+ to @self.output_shapes
+ """
+ enc_outputs = self.nets["encoder"](**inputs)
+ mlp_out = self.nets["mlp"](enc_outputs)
+ return self.nets["decoder"](mlp_out)
+
+ def _to_string(self):
+ """
+ Subclasses should override this method to print out info about network / policy.
+ """
+ return ''
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ indent = ' ' * 4
+ if self._to_string() != '':
+ msg += textwrap.indent("\n" + self._to_string() + "\n", indent)
+ msg += textwrap.indent("\nencoder={}".format(self.nets["encoder"]), indent)
+ msg += textwrap.indent("\n\nmlp={}".format(self.nets["mlp"]), indent)
+ msg += textwrap.indent("\n\ndecoder={}".format(self.nets["decoder"]), indent)
+ msg = header + '(' + msg + '\n)'
+ return msg
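+
+
+# Usage sketch (illustrative comment only, not library code; key names, shapes, and
+# prior ObsUtils modality registration are assumptions):
+#
+#   net = MIMO_MLP(
+#       input_obs_group_shapes=OrderedDict(obs=OrderedDict(robot0_eef_pos=(3,))),
+#       output_shapes=OrderedDict(action=(7,)),
+#       layer_dims=[256, 256],
+#   )
+#   out = net(obs=dict(robot0_eef_pos=torch.zeros(4, 3)))   # {"action": [4, 7]}
+#
+# Internally this is encoder -> MLP -> decoder, with the MLP's output activation
+# kept on so the decoder always sees a non-linearity (see the constructor above).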
+
+
+class RNN_MIMO_MLP(Module):
+ """
+    A wrapper class for a multi-step RNN, a per-step MLP, and a decoder.
+
+ Structure: [encoder -> rnn -> mlp -> decoder]
+
+ All temporal inputs are processed by a shared @ObservationGroupEncoder,
+ followed by an RNN, and then a per-step multi-output MLP.
+ """
+ def __init__(
+ self,
+ input_obs_group_shapes,
+ output_shapes,
+ mlp_layer_dims,
+ rnn_hidden_dim,
+ rnn_num_layers,
+ rnn_type="LSTM", # [LSTM, GRU]
+ rnn_kwargs=None,
+ mlp_activation=nn.ReLU,
+ mlp_layer_func=nn.Linear,
+ per_step=True,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ input_obs_group_shapes (OrderedDict): a dictionary of dictionaries.
+ Each key in this dictionary should specify an observation group, and
+ the value should be an OrderedDict that maps modalities to
+ expected shapes.
+
+ output_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for outputs.
+
+ rnn_hidden_dim (int): RNN hidden dimension
+
+ rnn_num_layers (int): number of RNN layers
+
+ rnn_type (str): [LSTM, GRU]
+
+ rnn_kwargs (dict): kwargs for the rnn model
+
+ per_step (bool): if True, apply the MLP and observation decoder into @output_shapes
+ at every step of the RNN. Otherwise, apply them to the final hidden state of the
+ RNN.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ super(RNN_MIMO_MLP, self).__init__()
+ assert isinstance(input_obs_group_shapes, OrderedDict)
+ assert np.all([isinstance(input_obs_group_shapes[k], OrderedDict) for k in input_obs_group_shapes])
+ assert isinstance(output_shapes, OrderedDict)
+ self.input_obs_group_shapes = input_obs_group_shapes
+ self.output_shapes = output_shapes
+ self.per_step = per_step
+
+ self.nets = nn.ModuleDict()
+
+ # Encoder for all observation groups.
+ self.nets["encoder"] = ObservationGroupEncoder(
+ observation_group_shapes=input_obs_group_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ # flat encoder output dimension
+ rnn_input_dim = self.nets["encoder"].output_shape()[0]
+
+ # bidirectional RNNs mean that the output of RNN will be twice the hidden dimension
+        rnn_is_bidirectional = rnn_kwargs.get("bidirectional", False) if rnn_kwargs is not None else False
+ num_directions = int(rnn_is_bidirectional) + 1 # 2 if bidirectional, 1 otherwise
+ rnn_output_dim = num_directions * rnn_hidden_dim
+
+ per_step_net = None
+ self._has_mlp = (len(mlp_layer_dims) > 0)
+ if self._has_mlp:
+ self.nets["mlp"] = MLP(
+ input_dim=rnn_output_dim,
+ output_dim=mlp_layer_dims[-1],
+ layer_dims=mlp_layer_dims[:-1],
+ output_activation=mlp_activation,
+ layer_func=mlp_layer_func
+ )
+ self.nets["decoder"] = ObservationDecoder(
+ decode_shapes=self.output_shapes,
+ input_feat_dim=mlp_layer_dims[-1],
+ )
+ if self.per_step:
+ per_step_net = Sequential(self.nets["mlp"], self.nets["decoder"])
+ else:
+ self.nets["decoder"] = ObservationDecoder(
+ decode_shapes=self.output_shapes,
+ input_feat_dim=rnn_output_dim,
+ )
+ if self.per_step:
+ per_step_net = self.nets["decoder"]
+
+ # core network
+ self.nets["rnn"] = RNN_Base(
+ input_dim=rnn_input_dim,
+ rnn_hidden_dim=rnn_hidden_dim,
+ rnn_num_layers=rnn_num_layers,
+ rnn_type=rnn_type,
+ per_step_net=per_step_net,
+ rnn_kwargs=rnn_kwargs
+ )
+
+ def get_rnn_init_state(self, batch_size, device):
+ """
+ Get a default RNN state (zeros)
+
+ Args:
+ batch_size (int): batch size dimension
+
+ device: device the hidden state should be sent to.
+
+ Returns:
+ hidden_state (torch.Tensor or tuple): returns hidden state tensor or tuple of hidden state tensors
+ depending on the RNN type
+ """
+ return self.nets["rnn"].get_rnn_init_state(batch_size, device=device)
+
+ def output_shape(self, input_shape):
+ """
+ Returns output shape for this module, which is a dictionary instead
+ of a list since outputs are dictionaries.
+
+ Args:
+ input_shape (dict): dictionary of dictionaries, where each top-level key
+ corresponds to an observation group, and the low-level dictionaries
+ specify the shape for each modality in an observation dictionary
+ """
+
+ # infers temporal dimension from input shape
+ obs_group = list(self.input_obs_group_shapes.keys())[0]
+ mod = list(self.input_obs_group_shapes[obs_group].keys())[0]
+ T = input_shape[obs_group][mod][0]
+ TensorUtils.assert_size_at_dim(input_shape, size=T, dim=0,
+ msg="RNN_MIMO_MLP: input_shape inconsistent in temporal dimension")
+ # returns a dictionary instead of list since outputs are dictionaries
+ return { k : [T] + list(self.output_shapes[k]) for k in self.output_shapes }
+
+ def forward(self, rnn_init_state=None, return_state=False, **inputs):
+ """
+ Args:
+ inputs (dict): a dictionary of dictionaries with one dictionary per
+ observation group. Each observation group's dictionary should map
+ modality to torch.Tensor batches. Should be consistent with
+ @self.input_obs_group_shapes. First two leading dimensions should
+ be batch and time [B, T, ...] for each tensor.
+
+ rnn_init_state: rnn hidden state, initialize to zero state if set to None
+
+ return_state (bool): whether to return hidden state
+
+ Returns:
+ outputs (dict): dictionary of output torch.Tensors, that corresponds
+ to @self.output_shapes. Leading dimensions will be batch and time [B, T, ...]
+ for each tensor.
+
+ rnn_state (torch.Tensor or tuple): return the new rnn state (if @return_state)
+ """
+ for obs_group in self.input_obs_group_shapes:
+ for k in self.input_obs_group_shapes[obs_group]:
+ # first two dimensions should be [B, T] for inputs
+ assert inputs[obs_group][k].ndim - 2 == len(self.input_obs_group_shapes[obs_group][k])
+
+ # use encoder to extract flat rnn inputs
+ rnn_inputs = TensorUtils.time_distributed(inputs, self.nets["encoder"], inputs_as_kwargs=True)
+ assert rnn_inputs.ndim == 3 # [B, T, D]
+ if self.per_step:
+ return self.nets["rnn"].forward(inputs=rnn_inputs, rnn_init_state=rnn_init_state, return_state=return_state)
+
+ # apply MLP + decoder to last RNN output
+ outputs = self.nets["rnn"].forward(inputs=rnn_inputs, rnn_init_state=rnn_init_state, return_state=return_state)
+ if return_state:
+ outputs, rnn_state = outputs
+
+ assert outputs.ndim == 3 # [B, T, D]
+ if self._has_mlp:
+ outputs = self.nets["decoder"](self.nets["mlp"](outputs[:, -1]))
+ else:
+ outputs = self.nets["decoder"](outputs[:, -1])
+
+ if return_state:
+ return outputs, rnn_state
+ return outputs
+
+ def forward_step(self, rnn_state, **inputs):
+ """
+ Unroll network over a single timestep.
+
+ Args:
+            inputs (dict): expects same observation groups and modalities as
+                @self.input_obs_group_shapes, with an additional batch dimension
+                (but NOT time), since this is a single time step.
+
+ rnn_state (torch.Tensor): rnn hidden state
+
+ Returns:
+ outputs (dict): dictionary of output torch.Tensors, that corresponds
+ to @self.output_shapes. Does not contain time dimension.
+
+ rnn_state: return the new rnn state
+ """
+        # ensure that the only extra dimension is batch dim, not temporal dim
+        assert np.all([
+            inputs[obs_group][k].ndim - 1 == len(self.input_obs_group_shapes[obs_group][k])
+            for obs_group in self.input_obs_group_shapes
+            for k in self.input_obs_group_shapes[obs_group]
+        ])
+
+ inputs = TensorUtils.to_sequence(inputs)
+        outputs, rnn_state = self.forward(
+            rnn_init_state=rnn_state,
+            return_state=True,
+            **inputs,
+        )
+ if self.per_step:
+ # if outputs are not per-step, the time dimension is already reduced
+ outputs = outputs[:, 0]
+ return outputs, rnn_state
+
+ def _to_string(self):
+ """
+ Subclasses should override this method to print out info about network / policy.
+ """
+ return ''
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ indent = ' ' * 4
+ msg += textwrap.indent("\n" + self._to_string(), indent)
+ msg += textwrap.indent("\n\nencoder={}".format(self.nets["encoder"]), indent)
+ msg += textwrap.indent("\n\nrnn={}".format(self.nets["rnn"]), indent)
+ msg = header + '(' + msg + '\n)'
+ return msg
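+
+
+# Usage sketch (illustrative comment only, not library code; key names, shapes, and
+# prior ObsUtils modality registration are assumptions):
+#
+#   net = RNN_MIMO_MLP(
+#       input_obs_group_shapes=OrderedDict(obs=OrderedDict(robot0_eef_pos=(3,))),
+#       output_shapes=OrderedDict(action=(7,)),
+#       mlp_layer_dims=[256],
+#       rnn_hidden_dim=400,
+#       rnn_num_layers=2,
+#       rnn_type="LSTM",
+#       rnn_kwargs=dict(),
+#       per_step=True,
+#   )
+#   # training: time-distributed forward over [B, T, ...] inputs
+#   out = net(obs=dict(robot0_eef_pos=torch.zeros(4, 10, 3)))   # {"action": [4, 10, 7]}
+#   # rollout: keep the hidden state across single-step calls
+#   h0 = net.get_rnn_init_state(batch_size=1, device=torch.device("cpu"))
+#
+# With per_step=False, the MLP and decoder are applied only to the last RNN output,
+# so the returned dictionary has no time dimension.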
+
+
+class MIMO_Transformer(Module):
+ """
+ Extension to Transformer (based on GPT architecture) to accept multiple observation
+ dictionaries as input and to output dictionaries of tensors. Inputs are specified as
+ a dictionary of observation dictionaries, with each key corresponding to an observation group.
+ This module utilizes @ObservationGroupEncoder to process the multiple input dictionaries and
+ @ObservationDecoder to generate tensor dictionaries. The default behavior
+    for encoding the inputs is to process visual inputs with a learned CNN and concatenate
+    the flat encodings with the other flat inputs. The default behavior for generating
+ outputs is to use a linear layer branch to produce each modality separately
+ (including visual outputs).
+ """
+ def __init__(
+ self,
+ input_obs_group_shapes,
+ output_shapes,
+ transformer_embed_dim,
+ transformer_num_layers,
+ transformer_num_heads,
+ transformer_context_length,
+ transformer_emb_dropout=0.1,
+ transformer_attn_dropout=0.1,
+ transformer_block_output_dropout=0.1,
+ transformer_sinusoidal_embedding=False,
+ transformer_activation="gelu",
+ transformer_nn_parameter_for_timesteps=False,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ input_obs_group_shapes (OrderedDict): a dictionary of dictionaries.
+ Each key in this dictionary should specify an observation group, and
+ the value should be an OrderedDict that maps modalities to
+ expected shapes.
+ output_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for outputs.
+ transformer_embed_dim (int): dimension for embeddings used by transformer
+ transformer_num_layers (int): number of transformer blocks to stack
+ transformer_num_heads (int): number of attention heads for each
+ transformer block - must divide @transformer_embed_dim evenly. Self-attention is
+ computed over this many partitions of the embedding dimension separately.
+ transformer_context_length (int): expected length of input sequences
+ transformer_activation: non-linearity for input and output layers used in transformer
+ transformer_emb_dropout (float): dropout probability for embedding inputs in transformer
+ transformer_attn_dropout (float): dropout probability for attention outputs for each transformer block
+ transformer_block_output_dropout (float): dropout probability for final outputs for each transformer block
+ encoder_kwargs (dict): observation encoder config
+ """
+ super(MIMO_Transformer, self).__init__()
+
+ assert isinstance(input_obs_group_shapes, OrderedDict)
+ assert np.all([isinstance(input_obs_group_shapes[k], OrderedDict) for k in input_obs_group_shapes])
+ assert isinstance(output_shapes, OrderedDict)
+
+ self.input_obs_group_shapes = input_obs_group_shapes
+ self.output_shapes = output_shapes
+
+ self.nets = nn.ModuleDict()
+ self.params = nn.ParameterDict()
+
+ # Encoder for all observation groups.
+ self.nets["encoder"] = ObservationGroupEncoder(
+ observation_group_shapes=input_obs_group_shapes,
+ encoder_kwargs=encoder_kwargs,
+ feature_activation=None,
+ )
+
+ # flat encoder output dimension
+ transformer_input_dim = self.nets["encoder"].output_shape()[0]
+
+ self.nets["embed_encoder"] = nn.Linear(
+ transformer_input_dim, transformer_embed_dim
+ )
+
+ max_timestep = transformer_context_length
+
+ if transformer_sinusoidal_embedding:
+ self.nets["embed_timestep"] = PositionalEncoding(transformer_embed_dim)
+ elif transformer_nn_parameter_for_timesteps:
+ assert (
+ not transformer_sinusoidal_embedding
+ ), "nn.Parameter only works with learned embeddings"
+ self.params["embed_timestep"] = nn.Parameter(
+ torch.zeros(1, max_timestep, transformer_embed_dim)
+ )
+ else:
+ self.nets["embed_timestep"] = nn.Embedding(max_timestep, transformer_embed_dim)
+
+ # layer norm for embeddings
+ self.nets["embed_ln"] = nn.LayerNorm(transformer_embed_dim)
+
+ # dropout for input embeddings
+ self.nets["embed_drop"] = nn.Dropout(transformer_emb_dropout)
+
+ # GPT transformer
+ self.nets["transformer"] = GPT_Backbone(
+ embed_dim=transformer_embed_dim,
+ num_layers=transformer_num_layers,
+ num_heads=transformer_num_heads,
+ context_length=transformer_context_length,
+ attn_dropout=transformer_attn_dropout,
+ block_output_dropout=transformer_block_output_dropout,
+ activation=transformer_activation,
+ )
+
+ # decoder for output modalities
+ self.nets["decoder"] = ObservationDecoder(
+ decode_shapes=self.output_shapes,
+ input_feat_dim=transformer_embed_dim,
+ )
+
+ self.transformer_context_length = transformer_context_length
+ self.transformer_embed_dim = transformer_embed_dim
+ self.transformer_sinusoidal_embedding = transformer_sinusoidal_embedding
+ self.transformer_nn_parameter_for_timesteps = transformer_nn_parameter_for_timesteps
+
+ def output_shape(self, input_shape=None):
+ """
+ Returns output shape for this module, which is a dictionary instead
+ of a list since outputs are dictionaries.
+ """
+ return { k : list(self.output_shapes[k]) for k in self.output_shapes }
+
+ def embed_timesteps(self, embeddings):
+ """
+ Computes timestep-based embeddings (aka positional embeddings) to add to embeddings.
+ Args:
+            embeddings (torch.Tensor): embeddings before positional embeddings are added
+ Returns:
+ time_embeddings (torch.Tensor): positional embeddings to add to embeddings
+ """
+ timesteps = (
+ torch.arange(
+ 0,
+ embeddings.shape[1],
+ dtype=embeddings.dtype,
+ device=embeddings.device,
+ )
+ .unsqueeze(0)
+ .repeat(embeddings.shape[0], 1)
+ )
+        assert (timesteps >= 0.0).all(), "timesteps must be non-negative!"
+ if self.transformer_sinusoidal_embedding:
+ assert torch.is_floating_point(timesteps), timesteps.dtype
+ else:
+ timesteps = timesteps.long()
+
+ if self.transformer_nn_parameter_for_timesteps:
+ time_embeddings = self.params["embed_timestep"]
+ else:
+ time_embeddings = self.nets["embed_timestep"](
+ timesteps
+ ) # these are NOT fed into transformer, only added to the inputs.
+ # compute how many modalities were combined into embeddings, replicate time embeddings that many times
+ num_replicates = embeddings.shape[-1] // self.transformer_embed_dim
+ time_embeddings = torch.cat([time_embeddings for _ in range(num_replicates)], -1)
+ assert (
+ embeddings.shape == time_embeddings.shape
+ ), f"{embeddings.shape}, {time_embeddings.shape}"
+ return time_embeddings
+
+ def input_embedding(
+ self,
+ inputs,
+ ):
+ """
+        Process encoded observations into embeddings to pass to the transformer,
+        and add timestep-based embeddings (aka positional embeddings) to the inputs.
+ Args:
+ inputs (torch.Tensor): outputs from observation encoder
+ Returns:
+ embeddings (torch.Tensor): input embeddings to pass to transformer backbone.
+ """
+ embeddings = self.nets["embed_encoder"](inputs)
+ time_embeddings = self.embed_timesteps(embeddings)
+ embeddings = embeddings + time_embeddings
+ embeddings = self.nets["embed_ln"](embeddings)
+ embeddings = self.nets["embed_drop"](embeddings)
+
+ return embeddings
+
+
+ def forward(self, **inputs):
+ """
+ Process each set of inputs in its own observation group.
+ Args:
+ inputs (dict): a dictionary of dictionaries with one dictionary per
+ observation group. Each observation group's dictionary should map
+ modality to torch.Tensor batches. Should be consistent with
+ @self.input_obs_group_shapes. First two leading dimensions should
+ be batch and time [B, T, ...] for each tensor.
+ Returns:
+ outputs (dict): dictionary of output torch.Tensors, that corresponds
+ to @self.output_shapes. Leading dimensions will be batch and time [B, T, ...]
+ for each tensor.
+ """
+ for obs_group in self.input_obs_group_shapes:
+ for k in self.input_obs_group_shapes[obs_group]:
+ # first two dimensions should be [B, T] for inputs
+ if inputs[obs_group][k] is None:
+ continue
+ assert inputs[obs_group][k].ndim - 2 == len(self.input_obs_group_shapes[obs_group][k])
+
+ inputs = inputs.copy()
+
+ transformer_encoder_outputs = None
+ transformer_inputs = TensorUtils.time_distributed(
+ inputs, self.nets["encoder"], inputs_as_kwargs=True
+ )
+ assert transformer_inputs.ndim == 3 # [B, T, D]
+
+ if transformer_encoder_outputs is None:
+ transformer_embeddings = self.input_embedding(transformer_inputs)
+ # pass encoded sequences through transformer
+ transformer_encoder_outputs = self.nets["transformer"].forward(transformer_embeddings)
+
+ transformer_outputs = transformer_encoder_outputs
+ # apply decoder to each timestep of sequence to get a dictionary of outputs
+ transformer_outputs = TensorUtils.time_distributed(
+ transformer_outputs, self.nets["decoder"]
+ )
+ transformer_outputs["transformer_encoder_outputs"] = transformer_encoder_outputs
+ return transformer_outputs
+
+ def _to_string(self):
+ """
+ Subclasses should override this method to print out info about network / policy.
+ """
+ return ''
+
+ def __repr__(self):
+ """Pretty print network."""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ indent = ' ' * 4
+ if self._to_string() != '':
+ msg += textwrap.indent("\n" + self._to_string() + "\n", indent)
+ msg += textwrap.indent("\nencoder={}".format(self.nets["encoder"]), indent)
+ msg += textwrap.indent("\n\ntransformer={}".format(self.nets["transformer"]), indent)
+ msg += textwrap.indent("\n\ndecoder={}".format(self.nets["decoder"]), indent)
+ msg = header + '(' + msg + '\n)'
+ return msg
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/policy_nets.py b/phantom/submodules/phantom-robomimic/robomimic/models/policy_nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dba1d934cbb6b6a6f2d5c6475d699c48eb2a302
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/policy_nets.py
@@ -0,0 +1,1570 @@
+"""
+Contains torch Modules for policy networks. These networks take an
+observation dictionary as input (and possibly additional conditioning,
+such as subgoal or goal dictionaries) and produce action predictions,
+samples, or distributions as outputs. Note that actions
+are assumed to lie in [-1, 1], and most networks will have a final
+tanh activation to help ensure this range.
+"""
+import textwrap
+import numpy as np
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributions as D
+
+import robomimic.utils.tensor_utils as TensorUtils
+from robomimic.models.base_nets import Module
+from robomimic.models.transformers import GPT_Backbone
+from robomimic.models.obs_nets import MIMO_MLP, RNN_MIMO_MLP, MIMO_Transformer, ObservationDecoder
+from robomimic.models.vae_nets import VAE
+from robomimic.models.distributions import TanhWrappedDistribution
+
+
+class ActorNetwork(MIMO_MLP):
+ """
+ A basic policy network that predicts actions from observations.
+ Can optionally be goal conditioned on future observations.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ mlp_layer_dims,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps observation keys to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+ mlp_layer_dims ([int]): sequence of integers for the MLP hidden layers sizes.
+
+ goal_shapes (OrderedDict): a dictionary that maps observation keys to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-observation key information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ assert isinstance(obs_shapes, OrderedDict)
+ self.obs_shapes = obs_shapes
+ self.ac_dim = ac_dim
+
+ # set up different observation groups for @MIMO_MLP
+ observation_group_shapes = OrderedDict()
+ observation_group_shapes["obs"] = OrderedDict(self.obs_shapes)
+
+ self._is_goal_conditioned = False
+ if goal_shapes is not None and len(goal_shapes) > 0:
+ assert isinstance(goal_shapes, OrderedDict)
+ self._is_goal_conditioned = True
+ self.goal_shapes = OrderedDict(goal_shapes)
+ observation_group_shapes["goal"] = OrderedDict(self.goal_shapes)
+ else:
+ self.goal_shapes = OrderedDict()
+
+ output_shapes = self._get_output_shapes()
+ super(ActorNetwork, self).__init__(
+ input_obs_group_shapes=observation_group_shapes,
+ output_shapes=output_shapes,
+ layer_dims=mlp_layer_dims,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _get_output_shapes(self):
+ """
+ Allow subclasses to re-define outputs from @MIMO_MLP, since we won't
+ always directly predict actions, but may instead predict the parameters
+        of an action distribution.
+ """
+ return OrderedDict(action=(self.ac_dim,))
+
+ def output_shape(self, input_shape=None):
+ return [self.ac_dim]
+
+ def forward(self, obs_dict, goal_dict=None):
+ actions = super(ActorNetwork, self).forward(obs=obs_dict, goal=goal_dict)["action"]
+ # apply tanh squashing to ensure actions are in [-1, 1]
+ return torch.tanh(actions)
+
+ def _to_string(self):
+ """Info to pretty print."""
+ return "action_dim={}".format(self.ac_dim)
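+
+
+# Usage sketch (illustrative comment only, not library code; key names, shapes, and
+# prior ObsUtils modality registration are assumptions):
+#
+#   actor = ActorNetwork(
+#       obs_shapes=OrderedDict(robot0_eef_pos=(3,), robot0_gripper_qpos=(2,)),
+#       ac_dim=7,
+#       mlp_layer_dims=[300, 400],
+#   )
+#   acts = actor(dict(robot0_eef_pos=torch.zeros(4, 3),
+#                     robot0_gripper_qpos=torch.zeros(4, 2)))   # [4, 7], values in [-1, 1]
+#
+# Because of the final tanh, regression targets should be normalized to [-1, 1].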
+
+
+class PerturbationActorNetwork(ActorNetwork):
+ """
+ An action perturbation network - primarily used in BCQ.
+ It takes states and actions and returns action perturbations.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ mlp_layer_dims,
+ perturbation_scale=0.05,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps observation keys to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+ mlp_layer_dims ([int]): sequence of integers for the MLP hidden layers sizes.
+
+ perturbation_scale (float): the perturbation network output is always squashed to
+ lie in +/- @perturbation_scale. The final action output is equal to the original
+ input action added to the output perturbation (and clipped to lie in [-1, 1]).
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ self.perturbation_scale = perturbation_scale
+
+ # add in action as a modality
+ new_obs_shapes = OrderedDict(obs_shapes)
+ new_obs_shapes["action"] = (ac_dim,)
+
+ # pass to super class to instantiate network
+ super(PerturbationActorNetwork, self).__init__(
+ obs_shapes=new_obs_shapes,
+ ac_dim=ac_dim,
+ mlp_layer_dims=mlp_layer_dims,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def forward(self, obs_dict, acts, goal_dict=None):
+ """Forward pass through perturbation actor."""
+ # add in actions
+ inputs = dict(obs_dict)
+ inputs["action"] = acts
+ perturbations = super(PerturbationActorNetwork, self).forward(inputs, goal_dict)
+
+ # add perturbations from network to original actions, and ensure the new actions lie in [-1, 1]
+ output_actions = acts + self.perturbation_scale * perturbations
+ output_actions = output_actions.clamp(-1.0, 1.0)
+ return output_actions
+
+ def _to_string(self):
+ """Info to pretty print."""
+ return "action_dim={}, perturbation_scale={}".format(self.ac_dim, self.perturbation_scale)
+
+
+class GaussianActorNetwork(ActorNetwork):
+ """
+ Variant of actor network that learns a diagonal unimodal Gaussian distribution
+ over actions.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ mlp_layer_dims,
+ fixed_std=False,
+ std_activation="softplus",
+ init_last_fc_weight=None,
+ init_std=0.3,
+ mean_limits=(-9.0, 9.0),
+ std_limits=(0.007, 7.5),
+ low_noise_eval=True,
+ use_tanh=False,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+ mlp_layer_dims ([int]): sequence of integers for the MLP hidden layers sizes.
+
+ fixed_std (bool): if True, std is not learned, but kept constant at @init_std
+
+ std_activation (None or str): type of activation to use for std deviation. Options are:
+
+ None: no activation applied (not recommended unless using fixed std)
+
+ `'softplus'`: Only applicable if not using fixed std. Softplus activation applied, after which the
+ output is scaled by init_std / softplus(0)
+
+ `'exp'`: Only applicable if not using fixed std. Exp applied; this corresponds to network output
+ as being interpreted as log_std instead of std
+
+ NOTE: In all cases, the final result is clipped to be within @std_limits
+
+            init_last_fc_weight (None or float): if specified, will initialize the final layer network weights to be
+                uniformly sampled from [-init_last_fc_weight, init_last_fc_weight]
+
+            init_std (None or float): approximate initial scaling for standard deviation outputs
+                from the network
+
+ mean_limits (2-array): (min, max) to clamp final mean output by
+
+ std_limits (2-array): (min, max) to clamp final std output by
+
+            low_noise_eval (bool): if True, model will output means of Gaussian distribution
+ at eval time.
+
+ use_tanh (bool): if True, use a tanh-Gaussian distribution
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+
+ # parameters specific to Gaussian actor
+ self.fixed_std = fixed_std
+ self.init_std = init_std
+ self.mean_limits = np.array(mean_limits)
+ self.std_limits = np.array(std_limits)
+
+ # Define activations to use
+ def softplus_scaled(x):
+ out = F.softplus(x)
+ out = out * (self.init_std / F.softplus(torch.zeros(1).to(x.device)))
+ return out
+
+ self.activations = {
+ None: lambda x: x,
+ "softplus": softplus_scaled,
+ "exp": torch.exp,
+ }
+ assert std_activation in self.activations, \
+ "std_activation must be one of: {}; instead got: {}".format(self.activations.keys(), std_activation)
+ self.std_activation = std_activation if not self.fixed_std else None
+
+ self.low_noise_eval = low_noise_eval
+ self.use_tanh = use_tanh
+
+ super(GaussianActorNetwork, self).__init__(
+ obs_shapes=obs_shapes,
+ ac_dim=ac_dim,
+ mlp_layer_dims=mlp_layer_dims,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+        # If an initialization weight was specified, initialize the final layer weights and biases uniformly in that range
+ if init_last_fc_weight is not None:
+ with torch.no_grad():
+ for name, layer in self.nets["decoder"].nets.items():
+ torch.nn.init.uniform_(layer.weight, -init_last_fc_weight, init_last_fc_weight)
+ torch.nn.init.uniform_(layer.bias, -init_last_fc_weight, init_last_fc_weight)
+
+ def _get_output_shapes(self):
+ """
+ Tells @MIMO_MLP superclass about the output dictionary that should be generated
+ at the last layer. Network outputs parameters of Gaussian distribution.
+ """
+ return OrderedDict(
+ mean=(self.ac_dim,),
+ scale=(self.ac_dim,),
+ )
+
+ def forward_train(self, obs_dict, goal_dict=None):
+ """
+ Return full Gaussian distribution, which is useful for computing
+ quantities necessary at train-time, like log-likelihood, KL
+ divergence, etc.
+
+ Args:
+ obs_dict (dict): batch of observations
+ goal_dict (dict): if not None, batch of goal observations
+
+ Returns:
+ dist (Distribution): Gaussian distribution
+ """
+ out = MIMO_MLP.forward(self, obs=obs_dict, goal=goal_dict)
+ mean = out["mean"]
+ # Use either constant std or learned std depending on setting
+ scale = out["scale"] if not self.fixed_std else torch.ones_like(mean) * self.init_std
+
+ # Clamp the mean
+ mean = torch.clamp(mean, min=self.mean_limits[0], max=self.mean_limits[1])
+
+ # apply tanh squashing to mean if not using tanh-Gaussian to ensure mean is in [-1, 1]
+ if not self.use_tanh:
+ mean = torch.tanh(mean)
+
+ # Calculate scale
+ if self.low_noise_eval and (not self.training):
+ # override std value so that you always approximately sample the mean
+ scale = torch.ones_like(mean) * 1e-4
+ else:
+ # Post-process the scale accordingly
+ scale = self.activations[self.std_activation](scale)
+ # Clamp the scale
+ scale = torch.clamp(scale, min=self.std_limits[0], max=self.std_limits[1])
+
+
+ # the Independent call will make it so that `batch_shape` for dist will be equal to batch size
+ # while `event_shape` will be equal to action dimension - ensuring that log-probability
+ # computations are summed across the action dimension
+ dist = D.Normal(loc=mean, scale=scale)
+ dist = D.Independent(dist, 1)
+
+ if self.use_tanh:
+ # Wrap distribution with Tanh
+ dist = TanhWrappedDistribution(base_dist=dist, scale=1.)
+
+ return dist
+
+ def forward(self, obs_dict, goal_dict=None):
+ """
+ Samples actions from the policy distribution.
+
+ Args:
+ obs_dict (dict): batch of observations
+ goal_dict (dict): if not None, batch of goal observations
+
+ Returns:
+ action (torch.Tensor): batch of actions from policy distribution
+ """
+ dist = self.forward_train(obs_dict, goal_dict)
+ if self.low_noise_eval and (not self.training):
+ if self.use_tanh:
+ # # scaling factor lets us output actions like [-1. 1.] and is consistent with the distribution transform
+ # return (1. + 1e-6) * torch.tanh(dist.base_dist.mean)
+ return torch.tanh(dist.mean)
+ return dist.mean
+ return dist.sample()
+
+ def _to_string(self):
+ """Info to pretty print."""
+ msg = "action_dim={}\nfixed_std={}\nstd_activation={}\ninit_std={}\nmean_limits={}\nstd_limits={}\nlow_noise_eval={}".format(
+ self.ac_dim, self.fixed_std, self.std_activation, self.init_std, self.mean_limits, self.std_limits, self.low_noise_eval)
+ return msg
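+
+
+# Training sketch (illustrative comment only, not library code; obs_batch and
+# action_batch below are placeholder tensors shaped [B, 3] and [B, 7]):
+#
+#   actor = GaussianActorNetwork(
+#       obs_shapes=OrderedDict(robot0_eef_pos=(3,)),
+#       ac_dim=7,
+#       mlp_layer_dims=[300, 400],
+#   )
+#   dist = actor.forward_train(dict(robot0_eef_pos=obs_batch))  # Independent Normal over actions
+#   loss = -dist.log_prob(action_batch).mean()                  # negative log-likelihood BC loss
+#
+# At eval time (model in eval mode with low_noise_eval=True), forward() returns the
+# distribution mean rather than a noisy sample.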
+
+
+class GMMActorNetwork(ActorNetwork):
+ """
+ Variant of actor network that learns a multimodal Gaussian mixture distribution
+ over actions.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ mlp_layer_dims,
+ num_modes=5,
+ min_std=0.01,
+ std_activation="softplus",
+ low_noise_eval=True,
+ use_tanh=False,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+ mlp_layer_dims ([int]): sequence of integers for the MLP hidden layers sizes.
+
+ num_modes (int): number of GMM modes
+
+ min_std (float): minimum std output from network
+
+ std_activation (None or str): type of activation to use for std deviation. Options are:
+
+ `'softplus'`: Softplus activation applied
+
+ `'exp'`: Exp applied; this corresponds to network output being interpreted as log_std instead of std
+
+            low_noise_eval (bool): if True, model will sample from GMM with low std, so that
+ one of the GMM modes will be sampled (approximately)
+
+ use_tanh (bool): if True, use a tanh-Gaussian distribution
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+
+ # parameters specific to GMM actor
+ self.num_modes = num_modes
+ self.min_std = min_std
+ self.low_noise_eval = low_noise_eval
+ self.use_tanh = use_tanh
+
+ # Define activations to use
+ self.activations = {
+ "softplus": F.softplus,
+ "exp": torch.exp,
+ }
+ assert std_activation in self.activations, \
+ "std_activation must be one of: {}; instead got: {}".format(self.activations.keys(), std_activation)
+ self.std_activation = std_activation
+
+ super(GMMActorNetwork, self).__init__(
+ obs_shapes=obs_shapes,
+ ac_dim=ac_dim,
+ mlp_layer_dims=mlp_layer_dims,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _get_output_shapes(self):
+ """
+ Tells @MIMO_MLP superclass about the output dictionary that should be generated
+ at the last layer. Network outputs parameters of GMM distribution.
+ """
+ return OrderedDict(
+ mean=(self.num_modes, self.ac_dim),
+ scale=(self.num_modes, self.ac_dim),
+ logits=(self.num_modes,),
+ )
+
+ def forward_train(self, obs_dict, goal_dict=None):
+ """
+ Return full GMM distribution, which is useful for computing
+ quantities necessary at train-time, like log-likelihood, KL
+ divergence, etc.
+
+ Args:
+ obs_dict (dict): batch of observations
+ goal_dict (dict): if not None, batch of goal observations
+
+ Returns:
+ dist (Distribution): GMM distribution
+ """
+ out = MIMO_MLP.forward(self, obs=obs_dict, goal=goal_dict)
+ means = out["mean"]
+ scales = out["scale"]
+ logits = out["logits"]
+
+ # apply tanh squashing to means if not using tanh-GMM to ensure means are in [-1, 1]
+ if not self.use_tanh:
+ means = torch.tanh(means)
+
+ # Calculate scale
+ if self.low_noise_eval and (not self.training):
+ # low-noise for all Gaussian dists
+ scales = torch.ones_like(means) * 1e-4
+ else:
+ # post-process the scale accordingly
+ scales = self.activations[self.std_activation](scales) + self.min_std
+
+ # mixture components - make sure that `batch_shape` for the distribution is equal
+ # to (batch_size, num_modes) since MixtureSameFamily expects this shape
+ component_distribution = D.Normal(loc=means, scale=scales)
+ component_distribution = D.Independent(component_distribution, 1)
+
+ # unnormalized logits to categorical distribution for mixing the modes
+ mixture_distribution = D.Categorical(logits=logits)
+
+ dist = D.MixtureSameFamily(
+ mixture_distribution=mixture_distribution,
+ component_distribution=component_distribution,
+ )
+
+ if self.use_tanh:
+ # Wrap distribution with Tanh
+ dist = TanhWrappedDistribution(base_dist=dist, scale=1.)
+
+ return dist
+
+ def forward(self, obs_dict, goal_dict=None):
+ """
+ Samples actions from the policy distribution.
+
+ Args:
+ obs_dict (dict): batch of observations
+ goal_dict (dict): if not None, batch of goal observations
+
+ Returns:
+ action (torch.Tensor): batch of actions from policy distribution
+ """
+ dist = self.forward_train(obs_dict, goal_dict)
+ return dist.sample()
+
+ def _to_string(self):
+ """Info to pretty print."""
+ return "action_dim={}\nnum_modes={}\nmin_std={}\nstd_activation={}\nlow_noise_eval={}".format(
+ self.ac_dim, self.num_modes, self.min_std, self.std_activation, self.low_noise_eval)
+
+
+class RNNActorNetwork(RNN_MIMO_MLP):
+ """
+ An RNN policy network that predicts actions from observations.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ mlp_layer_dims,
+ rnn_hidden_dim,
+ rnn_num_layers,
+ rnn_type="LSTM", # [LSTM, GRU]
+ rnn_kwargs=None,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+ mlp_layer_dims ([int]): sequence of integers for the MLP hidden layers sizes.
+
+ rnn_hidden_dim (int): RNN hidden dimension
+
+ rnn_num_layers (int): number of RNN layers
+
+ rnn_type (str): [LSTM, GRU]
+
+ rnn_kwargs (dict): kwargs for the torch.nn.LSTM / GRU
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ self.ac_dim = ac_dim
+
+ assert isinstance(obs_shapes, OrderedDict)
+ self.obs_shapes = obs_shapes
+
+ # set up different observation groups for @RNN_MIMO_MLP
+ observation_group_shapes = OrderedDict()
+ observation_group_shapes["obs"] = OrderedDict(self.obs_shapes)
+
+ self._is_goal_conditioned = False
+ if goal_shapes is not None and len(goal_shapes) > 0:
+ assert isinstance(goal_shapes, OrderedDict)
+ self._is_goal_conditioned = True
+ self.goal_shapes = OrderedDict(goal_shapes)
+ observation_group_shapes["goal"] = OrderedDict(self.goal_shapes)
+ else:
+ self.goal_shapes = OrderedDict()
+
+ output_shapes = self._get_output_shapes()
+ super(RNNActorNetwork, self).__init__(
+ input_obs_group_shapes=observation_group_shapes,
+ output_shapes=output_shapes,
+ mlp_layer_dims=mlp_layer_dims,
+ mlp_activation=nn.ReLU,
+ mlp_layer_func=nn.Linear,
+ rnn_hidden_dim=rnn_hidden_dim,
+ rnn_num_layers=rnn_num_layers,
+ rnn_type=rnn_type,
+ rnn_kwargs=rnn_kwargs,
+ per_step=True,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _get_output_shapes(self):
+ """
+ Allow subclasses to re-define outputs from @RNN_MIMO_MLP, since we won't
+ always directly predict actions, but may instead predict the parameters
+        of an action distribution.
+ """
+ return OrderedDict(action=(self.ac_dim,))
+
+ def output_shape(self, input_shape):
+ # note: @input_shape should be dictionary (key: mod)
+ # infers temporal dimension from input shape
+ mod = list(self.obs_shapes.keys())[0]
+ T = input_shape[mod][0]
+ TensorUtils.assert_size_at_dim(input_shape, size=T, dim=0,
+ msg="RNNActorNetwork: input_shape inconsistent in temporal dimension")
+ return [T, self.ac_dim]
+
+ def forward(self, obs_dict, goal_dict=None, rnn_init_state=None, return_state=False):
+ """
+ Forward a sequence of inputs through the RNN and the per-step network.
+
+ Args:
+ obs_dict (dict): batch of observations - each tensor in the dictionary
+ should have leading dimensions batch and time [B, T, ...]
+ goal_dict (dict): if not None, batch of goal observations
+ rnn_init_state: rnn hidden state, initialize to zero state if set to None
+ return_state (bool): whether to return hidden state
+
+ Returns:
+ actions (torch.Tensor): predicted action sequence
+ rnn_state: return rnn state at the end if return_state is set to True
+ """
+ if self._is_goal_conditioned:
+ assert goal_dict is not None
+ # repeat the goal observation in time to match dimension with obs_dict
+ mod = list(obs_dict.keys())[0]
+ goal_dict = TensorUtils.unsqueeze_expand_at(goal_dict, size=obs_dict[mod].shape[1], dim=1)
+
+ outputs = super(RNNActorNetwork, self).forward(
+ obs=obs_dict, goal=goal_dict, rnn_init_state=rnn_init_state, return_state=return_state)
+
+ if return_state:
+ actions, state = outputs
+ else:
+ actions = outputs
+ state = None
+
+ # apply tanh squashing to ensure actions are in [-1, 1]
+ actions = torch.tanh(actions["action"])
+
+ if return_state:
+ return actions, state
+ else:
+ return actions
+
+ def forward_step(self, obs_dict, goal_dict=None, rnn_state=None):
+ """
+ Unroll RNN over single timestep to get actions.
+
+ Args:
+ obs_dict (dict): batch of observations. Should not contain
+ time dimension.
+ goal_dict (dict): if not None, batch of goal observations
+ rnn_state: rnn hidden state, initialize to zero state if set to None
+
+ Returns:
+ actions (torch.Tensor): batch of actions - does not contain time dimension
+ state: updated rnn state
+ """
+ obs_dict = TensorUtils.to_sequence(obs_dict)
+ action, state = self.forward(
+ obs_dict, goal_dict, rnn_init_state=rnn_state, return_state=True)
+ return action[:, 0], state
+
+ def _to_string(self):
+ """Info to pretty print."""
+ return "action_dim={}".format(self.ac_dim)
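+
+
+# Closed-loop rollout sketch (illustrative comment only, not library code; key names
+# and the per-step observation tensor `obs` of shape [1, 3] are placeholders):
+#
+#   policy = RNNActorNetwork(
+#       obs_shapes=OrderedDict(robot0_eef_pos=(3,)),
+#       ac_dim=7,
+#       mlp_layer_dims=[],
+#       rnn_hidden_dim=400,
+#       rnn_num_layers=2,
+#   )
+#   rnn_state = policy.get_rnn_init_state(batch_size=1, device=torch.device("cpu"))
+#   for _ in range(horizon):
+#       act, rnn_state = policy.forward_step(dict(robot0_eef_pos=obs), rnn_state=rnn_state)
+#
+# During training, call forward() on full [B, T, ...] observation sequences instead.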
+
+
+class RNNGMMActorNetwork(RNNActorNetwork):
+ """
+ An RNN GMM policy network that predicts sequences of action distributions from observation sequences.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ mlp_layer_dims,
+ rnn_hidden_dim,
+ rnn_num_layers,
+ rnn_type="LSTM", # [LSTM, GRU]
+ rnn_kwargs=None,
+ num_modes=5,
+ min_std=0.01,
+ std_activation="softplus",
+ low_noise_eval=True,
+ use_tanh=False,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+
+ rnn_hidden_dim (int): RNN hidden dimension
+
+ rnn_num_layers (int): number of RNN layers
+
+ rnn_type (str): [LSTM, GRU]
+
+ rnn_kwargs (dict): kwargs for the torch.nn.LSTM / GRU
+
+ num_modes (int): number of GMM modes
+
+ min_std (float): minimum std output from network
+
+ std_activation (None or str): type of activation to use for std deviation. Options are:
+
+ `'softplus'`: Softplus activation applied
+
+ `'exp'`: Exp applied; this corresponds to network output being interpreted as log_std instead of std
+
+            low_noise_eval (bool): if True, model will sample from GMM with low std, so that
+ one of the GMM modes will be sampled (approximately)
+
+ use_tanh (bool): if True, use a tanh-Gaussian distribution
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+
+ # parameters specific to GMM actor
+ self.num_modes = num_modes
+ self.min_std = min_std
+ self.low_noise_eval = low_noise_eval
+ self.use_tanh = use_tanh
+
+ # Define activations to use
+ self.activations = {
+ "softplus": F.softplus,
+ "exp": torch.exp,
+ }
+ assert std_activation in self.activations, \
+ "std_activation must be one of: {}; instead got: {}".format(self.activations.keys(), std_activation)
+ self.std_activation = std_activation
+
+ super(RNNGMMActorNetwork, self).__init__(
+ obs_shapes=obs_shapes,
+ ac_dim=ac_dim,
+ mlp_layer_dims=mlp_layer_dims,
+ rnn_hidden_dim=rnn_hidden_dim,
+ rnn_num_layers=rnn_num_layers,
+ rnn_type=rnn_type,
+ rnn_kwargs=rnn_kwargs,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _get_output_shapes(self):
+ """
+ Tells @MIMO_MLP superclass about the output dictionary that should be generated
+ at the last layer. Network outputs parameters of GMM distribution.
+ """
+ return OrderedDict(
+ mean=(self.num_modes, self.ac_dim),
+ scale=(self.num_modes, self.ac_dim),
+ logits=(self.num_modes,),
+ )
+
+ def forward_train(self, obs_dict, goal_dict=None, rnn_init_state=None, return_state=False):
+ """
+ Return full GMM distribution, which is useful for computing
+ quantities necessary at train-time, like log-likelihood, KL
+ divergence, etc.
+
+ Args:
+ obs_dict (dict): batch of observations
+ goal_dict (dict): if not None, batch of goal observations
+ rnn_init_state: rnn hidden state, initialize to zero state if set to None
+ return_state (bool): whether to return hidden state
+
+ Returns:
+ dists (Distribution): sequence of GMM distributions over the timesteps
+ rnn_state: return rnn state at the end if return_state is set to True
+ """
+ if self._is_goal_conditioned:
+ assert goal_dict is not None
+ # repeat the goal observation in time to match dimension with obs_dict
+ mod = list(obs_dict.keys())[0]
+ goal_dict = TensorUtils.unsqueeze_expand_at(goal_dict, size=obs_dict[mod].shape[1], dim=1)
+
+ outputs = RNN_MIMO_MLP.forward(
+ self, obs=obs_dict, goal=goal_dict, rnn_init_state=rnn_init_state, return_state=return_state)
+
+ if return_state:
+ outputs, state = outputs
+ else:
+ state = None
+
+ means = outputs["mean"]
+ scales = outputs["scale"]
+ logits = outputs["logits"]
+
+ # apply tanh squashing to mean if not using tanh-GMM to ensure means are in [-1, 1]
+ if not self.use_tanh:
+ means = torch.tanh(means)
+
+ if self.low_noise_eval and (not self.training):
+ # low-noise for all Gaussian dists
+ scales = torch.ones_like(means) * 1e-4
+ else:
+ # post-process the scale accordingly
+ scales = self.activations[self.std_activation](scales) + self.min_std
+
+ # mixture components - make sure that `batch_shape` for the distribution is equal
+ # to (batch_size, timesteps, num_modes) since MixtureSameFamily expects this shape
+ component_distribution = D.Normal(loc=means, scale=scales)
+ component_distribution = D.Independent(component_distribution, 1) # shift action dim to event shape
+
+ # unnormalized logits to categorical distribution for mixing the modes
+ mixture_distribution = D.Categorical(logits=logits)
+
+ dists = D.MixtureSameFamily(
+ mixture_distribution=mixture_distribution,
+ component_distribution=component_distribution,
+ )
+
+ if self.use_tanh:
+ # Wrap distribution with Tanh
+ dists = TanhWrappedDistribution(base_dist=dists, scale=1.)
+
+ if return_state:
+ return dists, state
+ else:
+ return dists
+
+ def forward(self, obs_dict, goal_dict=None, rnn_init_state=None, return_state=False):
+ """
+ Samples actions from the policy distribution.
+
+ Args:
+ obs_dict (dict): batch of observations
+ goal_dict (dict): if not None, batch of goal observations
+
+ Returns:
+ action (torch.Tensor): batch of actions from policy distribution
+ """
+ out = self.forward_train(obs_dict=obs_dict, goal_dict=goal_dict, rnn_init_state=rnn_init_state, return_state=return_state)
+ if return_state:
+ ad, state = out
+ return ad.sample(), state
+ return out.sample()
+
+ def forward_train_step(self, obs_dict, goal_dict=None, rnn_state=None):
+ """
+ Unroll RNN over single timestep to get action GMM distribution, which
+ is useful for computing quantities necessary at train-time, like
+ log-likelihood, KL divergence, etc.
+
+ Args:
+ obs_dict (dict): batch of observations. Should not contain
+ time dimension.
+ goal_dict (dict): if not None, batch of goal observations
+ rnn_state: rnn hidden state, initialize to zero state if set to None
+
+ Returns:
+ ad (Distribution): GMM action distributions
+ state: updated rnn state
+ """
+ obs_dict = TensorUtils.to_sequence(obs_dict)
+ ad, state = self.forward_train(
+ obs_dict, goal_dict, rnn_init_state=rnn_state, return_state=True)
+
+ # to squeeze time dimension, make another action distribution
+ assert ad.component_distribution.base_dist.loc.shape[1] == 1
+ assert ad.component_distribution.base_dist.scale.shape[1] == 1
+ assert ad.mixture_distribution.logits.shape[1] == 1
+ component_distribution = D.Normal(
+ loc=ad.component_distribution.base_dist.loc.squeeze(1),
+ scale=ad.component_distribution.base_dist.scale.squeeze(1),
+ )
+ component_distribution = D.Independent(component_distribution, 1)
+ mixture_distribution = D.Categorical(logits=ad.mixture_distribution.logits.squeeze(1))
+ ad = D.MixtureSameFamily(
+ mixture_distribution=mixture_distribution,
+ component_distribution=component_distribution,
+ )
+ return ad, state
+
+ def forward_step(self, obs_dict, goal_dict=None, rnn_state=None):
+ """
+ Unroll RNN over single timestep to get sampled actions.
+
+ Args:
+ obs_dict (dict): batch of observations. Should not contain
+ time dimension.
+ goal_dict (dict): if not None, batch of goal observations
+ rnn_state: rnn hidden state, initialize to zero state if set to None
+
+ Returns:
+ acts (torch.Tensor): batch of actions - does not contain time dimension
+ state: updated rnn state
+ """
+ obs_dict = TensorUtils.to_sequence(obs_dict)
+ acts, state = self.forward(
+ obs_dict, goal_dict, rnn_init_state=rnn_state, return_state=True)
+ assert acts.shape[1] == 1
+ return acts[:, 0], state
+
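+    # Rollout sketch (illustrative, hypothetical names `policy`, `obs_t`, `horizon`): at
+    # test time the policy is typically stepped one observation at a time, carrying the
+    # RNN state across calls, e.g.
+    #
+    #     rnn_state = None
+    #     for t in range(horizon):
+    #         # obs_t: dict of per-modality tensors without a time dimension
+    #         action, rnn_state = policy.forward_step(obs_t, rnn_state=rnn_state)
+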
+ def _to_string(self):
+ """Info to pretty print."""
+ msg = "action_dim={}, std_activation={}, low_noise_eval={}, num_nodes={}, min_std={}".format(
+ self.ac_dim, self.std_activation, self.low_noise_eval, self.num_modes, self.min_std)
+ return msg
+
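+# Illustrative sketch: the GMM construction used in forward_train above follows the
+# standard batched torch.distributions pattern. With hypothetical shapes B (batch),
+# T (time), M (modes), A (action dim):
+#
+#     means  = torch.randn(B, T, M, A)                    # per-mode means
+#     scales = torch.full((B, T, M, A), 0.1)              # per-mode std devs
+#     logits = torch.randn(B, T, M)                       # unnormalized mode weights
+#     comp = D.Independent(D.Normal(means, scales), 1)    # action dim -> event shape
+#     mix  = D.Categorical(logits=logits)
+#     gmm  = D.MixtureSameFamily(mixture_distribution=mix, component_distribution=comp)
+#     actions = gmm.sample()                              # shape (B, T, A)
+#     logp = gmm.log_prob(actions)                        # shape (B, T)
+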
+
+class TransformerActorNetwork(MIMO_Transformer):
+ """
+    A Transformer policy network that predicts actions from observation sequences (assumed to be frame stacked
+    from previous observations) and possibly from previous actions as well (in an autoregressive manner).
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ transformer_embed_dim,
+ transformer_num_layers,
+ transformer_num_heads,
+ transformer_context_length,
+ transformer_emb_dropout=0.1,
+ transformer_attn_dropout=0.1,
+ transformer_block_output_dropout=0.1,
+ transformer_sinusoidal_embedding=False,
+ transformer_activation="gelu",
+ transformer_nn_parameter_for_timesteps=False,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+ transformer_embed_dim (int): dimension for embeddings used by transformer
+
+ transformer_num_layers (int): number of transformer blocks to stack
+
+ transformer_num_heads (int): number of attention heads for each
+ transformer block - must divide @transformer_embed_dim evenly. Self-attention is
+ computed over this many partitions of the embedding dimension separately.
+
+ transformer_context_length (int): expected length of input sequences
+
+            transformer_emb_dropout (float): dropout probability for embedding inputs in the transformer
+
+ transformer_attn_dropout (float): dropout probability for attention outputs for each transformer block
+
+ transformer_block_output_dropout (float): dropout probability for final outputs for each transformer block
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ self.ac_dim = ac_dim
+
+ assert isinstance(obs_shapes, OrderedDict)
+ self.obs_shapes = obs_shapes
+
+ self.transformer_nn_parameter_for_timesteps = transformer_nn_parameter_for_timesteps
+
+        # set up different observation groups for @MIMO_Transformer
+ observation_group_shapes = OrderedDict()
+ observation_group_shapes["obs"] = OrderedDict(self.obs_shapes)
+
+ self._is_goal_conditioned = False
+ if goal_shapes is not None and len(goal_shapes) > 0:
+ assert isinstance(goal_shapes, OrderedDict)
+ self._is_goal_conditioned = True
+ self.goal_shapes = OrderedDict(goal_shapes)
+ observation_group_shapes["goal"] = OrderedDict(self.goal_shapes)
+ else:
+ self.goal_shapes = OrderedDict()
+
+ output_shapes = self._get_output_shapes()
+ super(TransformerActorNetwork, self).__init__(
+ input_obs_group_shapes=observation_group_shapes,
+ output_shapes=output_shapes,
+ transformer_embed_dim=transformer_embed_dim,
+ transformer_num_layers=transformer_num_layers,
+ transformer_num_heads=transformer_num_heads,
+ transformer_context_length=transformer_context_length,
+ transformer_emb_dropout=transformer_emb_dropout,
+ transformer_attn_dropout=transformer_attn_dropout,
+ transformer_block_output_dropout=transformer_block_output_dropout,
+ transformer_sinusoidal_embedding=transformer_sinusoidal_embedding,
+ transformer_activation=transformer_activation,
+ transformer_nn_parameter_for_timesteps=transformer_nn_parameter_for_timesteps,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _get_output_shapes(self):
+ """
+ Allow subclasses to re-define outputs from @MIMO_Transformer, since we won't
+ always directly predict actions, but may instead predict the parameters
+        of an action distribution.
+ """
+ output_shapes = OrderedDict(action=(self.ac_dim,))
+ return output_shapes
+
+ def output_shape(self, input_shape):
+ # note: @input_shape should be dictionary (key: mod)
+ # infers temporal dimension from input shape
+ mod = list(self.obs_shapes.keys())[0]
+ T = input_shape[mod][0]
+ TensorUtils.assert_size_at_dim(input_shape, size=T, dim=0,
+ msg="TransformerActorNetwork: input_shape inconsistent in temporal dimension")
+ return [T, self.ac_dim]
+
+ def forward(self, obs_dict, actions=None, goal_dict=None):
+ """
+ Forward a sequence of inputs through the Transformer.
+ Args:
+ obs_dict (dict): batch of observations - each tensor in the dictionary
+ should have leading dimensions batch and time [B, T, ...]
+ actions (torch.Tensor): batch of actions of shape [B, T, D]
+ goal_dict (dict): if not None, batch of goal observations
+ Returns:
+ outputs (torch.Tensor or dict): contains predicted action sequence, or dictionary
+ with predicted action sequence and predicted observation sequences
+ """
+ if self._is_goal_conditioned:
+ assert goal_dict is not None
+ # repeat the goal observation in time to match dimension with obs_dict
+ mod = list(obs_dict.keys())[0]
+ goal_dict = TensorUtils.unsqueeze_expand_at(goal_dict, size=obs_dict[mod].shape[1], dim=1)
+
+ forward_kwargs = dict(obs=obs_dict, goal=goal_dict)
+ outputs = super(TransformerActorNetwork, self).forward(**forward_kwargs)
+
+ # apply tanh squashing to ensure actions are in [-1, 1]
+ outputs["action"] = torch.tanh(outputs["action"])
+
+ return outputs["action"] # only action sequences
+
+ def _to_string(self):
+ """Info to pretty print."""
+ return "action_dim={}".format(self.ac_dim)
+
+
+class TransformerGMMActorNetwork(TransformerActorNetwork):
+ """
+ A Transformer GMM policy network that predicts sequences of action distributions from observation
+ sequences (assumed to be frame stacked from previous observations).
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ transformer_embed_dim,
+ transformer_num_layers,
+ transformer_num_heads,
+ transformer_context_length,
+ transformer_emb_dropout=0.1,
+ transformer_attn_dropout=0.1,
+ transformer_block_output_dropout=0.1,
+ transformer_sinusoidal_embedding=False,
+ transformer_activation="gelu",
+ transformer_nn_parameter_for_timesteps=False,
+ num_modes=5,
+ min_std=0.01,
+ std_activation="softplus",
+ low_noise_eval=True,
+ use_tanh=False,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+ transformer_embed_dim (int): dimension for embeddings used by transformer
+
+ transformer_num_layers (int): number of transformer blocks to stack
+
+ transformer_num_heads (int): number of attention heads for each
+ transformer block - must divide @transformer_embed_dim evenly. Self-attention is
+ computed over this many partitions of the embedding dimension separately.
+
+ transformer_context_length (int): expected length of input sequences
+
+            transformer_emb_dropout (float): dropout probability for embedding inputs in the transformer
+
+ transformer_attn_dropout (float): dropout probability for attention outputs for each transformer block
+
+ transformer_block_output_dropout (float): dropout probability for final outputs for each transformer block
+
+ num_modes (int): number of GMM modes
+
+ min_std (float): minimum std output from network
+
+ std_activation (None or str): type of activation to use for std deviation. Options are:
+
+ `'softplus'`: Softplus activation applied
+
+ `'exp'`: Exp applied; this corresponds to network output being interpreted as log_std instead of std
+
+            low_noise_eval (bool): if True, model will sample from GMM with low std, so that
+ one of the GMM modes will be sampled (approximately)
+
+ use_tanh (bool): if True, use a tanh-Gaussian distribution
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+
+ # parameters specific to GMM actor
+ self.num_modes = num_modes
+ self.min_std = min_std
+ self.low_noise_eval = low_noise_eval
+ self.use_tanh = use_tanh
+
+ # Define activations to use
+ self.activations = {
+ "softplus": F.softplus,
+ "exp": torch.exp,
+ }
+ assert std_activation in self.activations, \
+ "std_activation must be one of: {}; instead got: {}".format(self.activations.keys(), std_activation)
+ self.std_activation = std_activation
+
+ super(TransformerGMMActorNetwork, self).__init__(
+ obs_shapes=obs_shapes,
+ ac_dim=ac_dim,
+ transformer_embed_dim=transformer_embed_dim,
+ transformer_num_layers=transformer_num_layers,
+ transformer_num_heads=transformer_num_heads,
+ transformer_context_length=transformer_context_length,
+ transformer_emb_dropout=transformer_emb_dropout,
+ transformer_attn_dropout=transformer_attn_dropout,
+ transformer_block_output_dropout=transformer_block_output_dropout,
+ transformer_sinusoidal_embedding=transformer_sinusoidal_embedding,
+ transformer_activation=transformer_activation,
+ transformer_nn_parameter_for_timesteps=transformer_nn_parameter_for_timesteps,
+ encoder_kwargs=encoder_kwargs,
+ goal_shapes=goal_shapes,
+ )
+
+ def _get_output_shapes(self):
+ """
+ Tells @MIMO_Transformer superclass about the output dictionary that should be generated
+ at the last layer. Network outputs parameters of GMM distribution.
+ """
+ return OrderedDict(
+ mean=(self.num_modes, self.ac_dim),
+ scale=(self.num_modes, self.ac_dim),
+ logits=(self.num_modes,),
+ )
+
+ def forward_train(self, obs_dict, actions=None, goal_dict=None, low_noise_eval=None):
+ """
+ Return full GMM distribution, which is useful for computing
+ quantities necessary at train-time, like log-likelihood, KL
+ divergence, etc.
+ Args:
+ obs_dict (dict): batch of observations
+ actions (torch.Tensor): batch of actions
+            goal_dict (dict): if not None, batch of goal observations
+            low_noise_eval (None or bool): if not None, overrides @self.low_noise_eval for this call
+ Returns:
+ dists (Distribution): sequence of GMM distributions over the timesteps
+ """
+ if self._is_goal_conditioned:
+ assert goal_dict is not None
+ # repeat the goal observation in time to match dimension with obs_dict
+ mod = list(obs_dict.keys())[0]
+ goal_dict = TensorUtils.unsqueeze_expand_at(goal_dict, size=obs_dict[mod].shape[1], dim=1)
+
+ forward_kwargs = dict(obs=obs_dict, goal=goal_dict)
+
+ outputs = MIMO_Transformer.forward(self, **forward_kwargs)
+
+ means = outputs["mean"]
+ scales = outputs["scale"]
+ logits = outputs["logits"]
+
+ # apply tanh squashing to mean if not using tanh-GMM to ensure means are in [-1, 1]
+ if not self.use_tanh:
+ means = torch.tanh(means)
+
+ if low_noise_eval is None:
+ low_noise_eval = self.low_noise_eval
+ if low_noise_eval and (not self.training):
+ # low-noise for all Gaussian dists
+ scales = torch.ones_like(means) * 1e-4
+ else:
+ # post-process the scale accordingly
+ scales = self.activations[self.std_activation](scales) + self.min_std
+
+ # mixture components - make sure that `batch_shape` for the distribution is equal
+ # to (batch_size, timesteps, num_modes) since MixtureSameFamily expects this shape
+ component_distribution = D.Normal(loc=means, scale=scales)
+ component_distribution = D.Independent(component_distribution, 1) # shift action dim to event shape
+
+ # unnormalized logits to categorical distribution for mixing the modes
+ mixture_distribution = D.Categorical(logits=logits)
+
+ dists = D.MixtureSameFamily(
+ mixture_distribution=mixture_distribution,
+ component_distribution=component_distribution,
+ )
+
+ if self.use_tanh:
+ # Wrap distribution with Tanh
+ dists = TanhWrappedDistribution(base_dist=dists, scale=1.)
+
+ return dists
+
+ def forward(self, obs_dict, actions=None, goal_dict=None):
+ """
+ Samples actions from the policy distribution.
+ Args:
+ obs_dict (dict): batch of observations
+ actions (torch.Tensor): batch of actions
+ goal_dict (dict): if not None, batch of goal observations
+ Returns:
+ action (torch.Tensor): batch of actions from policy distribution
+ """
+ out = self.forward_train(obs_dict=obs_dict, actions=actions, goal_dict=goal_dict)
+ return out.sample()
+
+ def _to_string(self):
+ """Info to pretty print."""
+ msg = "action_dim={}, std_activation={}, low_noise_eval={}, num_nodes={}, min_std={}".format(
+ self.ac_dim, self.std_activation, self.low_noise_eval, self.num_modes, self.min_std)
+ return msg
+
+
+class VAEActor(Module):
+ """
+ A VAE that models a distribution of actions conditioned on observations.
+ The VAE prior and decoder are used at test-time as the policy.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ encoder_layer_dims,
+ decoder_layer_dims,
+ latent_dim,
+ device,
+ decoder_is_conditioned=True,
+ decoder_reconstruction_sum_across_elements=False,
+ latent_clip=None,
+ prior_learn=False,
+ prior_is_conditioned=False,
+ prior_layer_dims=(),
+ prior_use_gmm=False,
+ prior_gmm_num_modes=10,
+ prior_gmm_learn_weights=False,
+ prior_use_categorical=False,
+ prior_categorical_dim=10,
+ prior_categorical_gumbel_softmax_hard=False,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ super(VAEActor, self).__init__()
+
+ self.obs_shapes = obs_shapes
+ self.ac_dim = ac_dim
+ action_shapes = OrderedDict(action=(self.ac_dim,))
+
+ # ensure VAE decoder will squash actions into [-1, 1]
+ output_squash = ['action']
+ output_scales = OrderedDict(action=1.)
+
+ self._vae = VAE(
+ input_shapes=action_shapes,
+ output_shapes=action_shapes,
+ encoder_layer_dims=encoder_layer_dims,
+ decoder_layer_dims=decoder_layer_dims,
+ latent_dim=latent_dim,
+ device=device,
+ condition_shapes=self.obs_shapes,
+ decoder_is_conditioned=decoder_is_conditioned,
+ decoder_reconstruction_sum_across_elements=decoder_reconstruction_sum_across_elements,
+ latent_clip=latent_clip,
+ output_squash=output_squash,
+ output_scales=output_scales,
+ prior_learn=prior_learn,
+ prior_is_conditioned=prior_is_conditioned,
+ prior_layer_dims=prior_layer_dims,
+ prior_use_gmm=prior_use_gmm,
+ prior_gmm_num_modes=prior_gmm_num_modes,
+ prior_gmm_learn_weights=prior_gmm_learn_weights,
+ prior_use_categorical=prior_use_categorical,
+ prior_categorical_dim=prior_categorical_dim,
+ prior_categorical_gumbel_softmax_hard=prior_categorical_gumbel_softmax_hard,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def encode(self, actions, obs_dict, goal_dict=None):
+ """
+ Args:
+ actions (torch.Tensor): a batch of actions
+
+ obs_dict (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to the observation modalities
+ used for conditioning in either the decoder or the prior (or both).
+
+ goal_dict (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to goal modalities.
+
+ Returns:
+ posterior params (dict): dictionary with the following keys:
+
+ mean (torch.Tensor): posterior encoder means
+
+ logvar (torch.Tensor): posterior encoder logvars
+ """
+ inputs = OrderedDict(action=actions)
+ return self._vae.encode(inputs=inputs, conditions=obs_dict, goals=goal_dict)
+
+ def decode(self, obs_dict=None, goal_dict=None, z=None, n=None):
+ """
+ Thin wrapper around @VaeNets.VAE implementation.
+
+ Args:
+ obs_dict (dict): a dictionary that maps modalities to torch.Tensor
+ batches. Only needs to be provided if @decoder_is_conditioned
+ or @z is None (since the prior will require it to generate z).
+
+ goal_dict (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to goal modalities.
+
+ z (torch.Tensor): if provided, these latents are used to generate
+ reconstructions from the VAE, and the prior is not sampled.
+
+ n (int): this argument is used to specify the number of samples to
+                generate from the prior. Only required if @z is None, i.e. when
+                sampling from the prior takes place.
+
+ Returns:
+ recons (dict): dictionary of reconstructed inputs (this will be a dictionary
+ with a single "action" key)
+ """
+ return self._vae.decode(conditions=obs_dict, goals=goal_dict, z=z, n=n)
+
+ def sample_prior(self, obs_dict=None, goal_dict=None, n=None):
+ """
+ Thin wrapper around @VaeNets.VAE implementation.
+
+ Args:
+ n (int): this argument is used to specify the number
+ of samples to generate from the prior.
+
+ obs_dict (dict): a dictionary that maps modalities to torch.Tensor
+ batches. Only needs to be provided if @prior_is_conditioned.
+
+ goal_dict (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to goal modalities.
+
+ Returns:
+ z (torch.Tensor): latents sampled from the prior
+ """
+ return self._vae.sample_prior(n=n, conditions=obs_dict, goals=goal_dict)
+
+ def set_gumbel_temperature(self, temperature):
+ """
+ Used by external algorithms to schedule Gumbel-Softmax temperature,
+ which is used during reparametrization at train-time. Should only be
+ used if @prior_use_categorical is True.
+ """
+ self._vae.set_gumbel_temperature(temperature)
+
+ def get_gumbel_temperature(self):
+ """
+ Return current Gumbel-Softmax temperature. Should only be used if
+ @prior_use_categorical is True.
+ """
+ return self._vae.get_gumbel_temperature()
+
+ def output_shape(self, input_shape=None):
+ """
+ This implementation is required by the Module superclass, but is unused since we
+ never chain this module to other ones.
+ """
+ return [self.ac_dim]
+
+ def forward_train(self, actions, obs_dict, goal_dict=None, freeze_encoder=False):
+ """
+ A full pass through the VAE network used during training to construct KL
+ and reconstruction losses. See @VAE class for more info.
+
+ Args:
+ actions (torch.Tensor): a batch of actions
+
+ obs_dict (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to the observation modalities
+ used for conditioning in either the decoder or the prior (or both).
+
+ goal_dict (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to goal modalities.
+
+ Returns:
+ vae_outputs (dict): a dictionary that contains the following outputs.
+
+ encoder_params (dict): parameters for the posterior distribution
+ from the encoder forward pass
+
+ encoder_z (torch.Tensor): latents sampled from the encoder posterior
+
+ decoder_outputs (dict): action reconstructions from the decoder
+
+ kl_loss (torch.Tensor): KL loss over the batch of data
+
+ reconstruction_loss (torch.Tensor): reconstruction loss over the batch of data
+ """
+ action_inputs = OrderedDict(action=actions)
+ return self._vae.forward(
+ inputs=action_inputs,
+ outputs=action_inputs,
+ conditions=obs_dict,
+ goals=goal_dict,
+ freeze_encoder=freeze_encoder)
+
+ def forward(self, obs_dict, goal_dict=None, z=None):
+ """
+ Samples actions from the policy distribution.
+
+ Args:
+ obs_dict (dict): batch of observations
+ goal_dict (dict): if not None, batch of goal observations
+ z (torch.Tensor): if not None, use the provided batch of latents instead
+ of sampling from the prior
+
+ Returns:
+ action (torch.Tensor): batch of actions from policy distribution
+ """
+ n = None
+ if z is None:
+ # prior will be sampled - so we must provide number of samples explicitly
+ mod = list(obs_dict.keys())[0]
+ n = obs_dict[mod].shape[0]
+ return self.decode(obs_dict=obs_dict, goal_dict=goal_dict, z=z, n=n)["action"]
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/transformers.py b/phantom/submodules/phantom-robomimic/robomimic/models/transformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..309bff301d02ad561a34021dbea5d370249cef0f
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/transformers.py
@@ -0,0 +1,426 @@
+"""
+Implementation of transformers, mostly based on Andrej's minGPT model.
+See https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+for more details.
+"""
+
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from robomimic.models.base_nets import Module
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+
+class GEGLU(nn.Module):
+ """
+ References:
+ Shazeer et al., "GLU Variants Improve Transformer," 2020.
+ https://arxiv.org/abs/2002.05202
+ Implementation: https://github.com/pfnet-research/deep-table/blob/237c8be8a405349ce6ab78075234c60d9bfe60b7/deep_table/nn/layers/activation.py
+ """
+
+ def geglu(self, x):
+ assert x.shape[-1] % 2 == 0
+ a, b = x.chunk(2, dim=-1)
+ return a * F.gelu(b)
+
+ def forward(self, x):
+ return self.geglu(x)
+
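+# Shape note (added for clarity): GEGLU halves the last dimension, e.g. an input of shape
+# (B, T, 2 * D) is split as a, b = x.chunk(2, dim=-1) and mapped to a * gelu(b) of shape
+# (B, T, D). This is why the transformer MLP below widens its first linear layer by a
+# factor of 2 when GEGLU is used as the activation.
+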
+
+class PositionalEncoding(nn.Module):
+ """
+ Taken from https://pytorch.org/tutorials/beginner/transformer_tutorial.html.
+ """
+
+ def __init__(self, embed_dim):
+ """
+ Standard sinusoidal positional encoding scheme in transformers.
+
+ Positional encoding of the k'th position in the sequence is given by:
+            p(k, 2i)   = sin(k / n^(2i/d))
+            p(k, 2i+1) = cos(k / n^(2i/d))
+
+ n: set to 10K in original Transformer paper
+ d: the embedding dimension
+ i: positions along the projected embedding space (ranges from 0 to d/2)
+
+ Args:
+ embed_dim: The number of dimensions to project the timesteps into.
+ """
+ super().__init__()
+ self.embed_dim = embed_dim
+
+ def forward(self, x):
+ """
+ Input timestep of shape BxT
+ """
+ position = x
+
+        # computing 1/n^(2i/d) in log space and then exponentiating and fixing the shape
+ div_term = (
+ torch.exp(
+ torch.arange(0, self.embed_dim, 2, device=x.device)
+ * (-math.log(10000.0) / self.embed_dim)
+ )
+ .unsqueeze(0)
+ .unsqueeze(0)
+ .repeat(x.shape[0], x.shape[1], 1)
+ )
+ pe = torch.zeros((x.shape[0], x.shape[1], self.embed_dim), device=x.device)
+ pe[:, :, 0::2] = torch.sin(position.unsqueeze(-1) * div_term)
+ pe[:, :, 1::2] = torch.cos(position.unsqueeze(-1) * div_term)
+ return pe.detach()
+
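+# Usage sketch (illustrative): given a batch of timesteps of shape (B, T), the module
+# returns the corresponding sinusoidal embeddings of shape (B, T, embed_dim), e.g.
+#
+#     pe = PositionalEncoding(embed_dim=16)
+#     timesteps = torch.arange(10).unsqueeze(0).repeat(4, 1).float()   # (B=4, T=10)
+#     emb = pe(timesteps)                                              # (4, 10, 16)
+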
+
+class CausalSelfAttention(Module):
+ def __init__(
+ self,
+ embed_dim,
+ num_heads,
+ context_length,
+ attn_dropout=0.1,
+ output_dropout=0.1,
+ ):
+ """
+ Multi-head masked self-attention layer + projection (MLP layer).
+
+ For normal self-attention (@num_heads = 1), every single input in the sequence is
+ mapped to a key, query, and value embedding of size @embed_dim. For each input,
+ its query vector is compared (using dot-product) with all other key vectors in the
+ sequence, and softmax normalized to compute an attention over all members of the
+ sequence. This is used to take a linear combination of corresponding value embeddings.
+
+ The @num_heads argument is for multi-head attention, where the self-attention operation above
+ is performed in parallel over equal size partitions of the @embed_dim, allowing for different
+ portions of the embedding dimension to model different kinds of attention. The attention
+ output for each head is concatenated together.
+
+ Finally, we use a causal mask here to ensure that each output only depends on inputs that come
+ before it.
+
+ Args:
+ embed_dim (int): dimension of embeddings to use for keys, queries, and values
+ used in self-attention
+
+ num_heads (int): number of attention heads - must divide @embed_dim evenly. Self-attention is
+ computed over this many partitions of the embedding dimension separately.
+
+ context_length (int): expected length of input sequences
+
+ attn_dropout (float): dropout probability for attention outputs
+
+ output_dropout (float): dropout probability for final outputs
+ """
+ super(CausalSelfAttention, self).__init__()
+
+ assert (
+ embed_dim % num_heads == 0
+ ), "num_heads: {} does not divide embed_dim: {} exactly".format(num_heads, embed_dim)
+
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.context_length = context_length
+ self.attn_dropout = attn_dropout
+ self.output_dropout = output_dropout
+ self.nets = nn.ModuleDict()
+
+ # projection layers for key, query, value, across all attention heads
+ self.nets["qkv"] = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)
+
+ # dropout layers
+ self.nets["attn_dropout"] = nn.Dropout(self.attn_dropout)
+ self.nets["output_dropout"] = nn.Dropout(self.output_dropout)
+
+ # output layer
+ self.nets["output"] = nn.Linear(self.embed_dim, self.embed_dim)
+
+ # causal mask (ensures attention is only over previous inputs) - just a lower triangular matrix of 1s
+ mask = torch.tril(torch.ones(context_length, context_length)).view(
+ 1, 1, context_length, context_length
+ )
+ self.register_buffer("mask", mask)
+
+ def forward(self, x):
+ """
+ Forward pass through Self-Attention block.
+ Input should be shape (B, T, D) where B is batch size, T is seq length (@self.context_length), and
+ D is input dimension (@self.embed_dim).
+ """
+
+ # enforce shape consistency
+ assert len(x.shape) == 3
+ B, T, D = x.shape
+ assert (
+ T <= self.context_length
+ ), "self-attention module can only handle sequences up to {} in length but got length {}".format(
+ self.context_length, T
+ )
+ assert D == self.embed_dim
+ NH = self.num_heads # number of attention heads
+ DH = D // NH # embed dimension for each attention head
+
+ # compute key, query, and value vectors for each member of sequence, and split across attention heads
+ qkv = self.nets["qkv"](x)
+ q, k, v = torch.chunk(qkv, 3, dim=-1)
+ k = k.view(B, T, NH, DH).transpose(1, 2) # [B, NH, T, DH]
+ q = q.view(B, T, NH, DH).transpose(1, 2) # [B, NH, T, DH]
+ v = v.view(B, T, NH, DH).transpose(1, 2) # [B, NH, T, DH]
+
+ # causal self-attention mechanism
+
+ # batched matrix multiplication between queries and keys to get all pair-wise dot-products.
+ # We broadcast across batch and attention heads and get pair-wise dot-products between all pairs of timesteps
+ # [B, NH, T, DH] x [B, NH, DH, T] -> [B, NH, T, T]
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+
+ # use mask to replace entries in dot products with negative inf to ensure they don't contribute to softmax,
+ # then take softmax over last dimension to end up with attention score for each member of sequence.
+ # Note the use of [:T, :T] - this makes it so we can handle sequences less than @self.context_length in length.
+ att = att.masked_fill(self.mask[..., :T, :T] == 0, float("-inf"))
+ att = F.softmax(
+ att, dim=-1
+ ) # shape [B, NH, T, T], last dimension has score over all T for each sequence member
+
+ # dropout on attention
+ att = self.nets["attn_dropout"](att)
+
+ # take weighted sum of value vectors over whole sequence according to attention, with batched matrix multiplication
+ # [B, NH, T, T] x [B, NH, T, DH] -> [B, NH, T, DH]
+ y = att @ v
+ # reshape [B, NH, T, DH] -> [B, T, NH, DH] -> [B, T, NH * DH] = [B, T, D]
+ y = y.transpose(1, 2).contiguous().view(B, T, D)
+
+ # pass through output layer + dropout
+ y = self.nets["output"](y)
+ y = self.nets["output_dropout"](y)
+ return y
+
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+
+ # this module doesn't modify the size of the input, it goes from (B, T, D) -> (B, T, D)
+ return list(input_shape)
+
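+# Shape sketch (illustrative): the attention layer maps (B, T, D) -> (B, T, D) and is
+# causal, i.e. the output at index t only attends to inputs at indices <= t, e.g.
+#
+#     attn = CausalSelfAttention(embed_dim=32, num_heads=4, context_length=10)
+#     x = torch.randn(8, 10, 32)   # (B, T, D) with T <= context_length
+#     y = attn(x)                  # (8, 10, 32)
+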
+
+class SelfAttentionBlock(Module):
+ """
+ A single Transformer Block, that can be chained together repeatedly.
+ It consists of a @CausalSelfAttention module and a small MLP, along with
+ layer normalization and residual connections on each input.
+ """
+
+ def __init__(
+ self,
+ embed_dim,
+ num_heads,
+ context_length,
+ attn_dropout=0.1,
+ output_dropout=0.1,
+ activation=nn.GELU(),
+ ):
+ """
+ Args:
+ embed_dim (int): dimension of embeddings to use for keys, queries, and values
+ used in self-attention
+
+ num_heads (int): number of attention heads - must divide @embed_dim evenly. Self-attention is
+ computed over this many partitions of the embedding dimension separately.
+
+ context_length (int): expected length of input sequences
+
+ attn_dropout (float): dropout probability for attention outputs
+
+ output_dropout (float): dropout probability for final outputs
+
+ activation (str): string denoting the activation function to use in each transformer block
+ """
+ super(SelfAttentionBlock, self).__init__()
+
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.context_length = context_length
+ self.attn_dropout = attn_dropout
+ self.output_dropout = output_dropout
+ self.nets = nn.ModuleDict()
+
+ # self-attention block
+ self.nets["attention"] = CausalSelfAttention(
+ embed_dim=embed_dim,
+ num_heads=num_heads,
+ context_length=context_length,
+ attn_dropout=attn_dropout,
+ output_dropout=output_dropout,
+ )
+
+        if isinstance(activation, GEGLU):
+ mult = 2
+ else:
+ mult = 1
+
+ # small 2-layer MLP
+ self.nets["mlp"] = nn.Sequential(
+ nn.Linear(embed_dim, 4 * embed_dim * mult),
+ activation,
+ nn.Linear(4 * embed_dim, embed_dim),
+ nn.Dropout(output_dropout)
+ )
+
+ # layer normalization for inputs to self-attention module and MLP
+ self.nets["ln1"] = nn.LayerNorm(embed_dim)
+ self.nets["ln2"] = nn.LayerNorm(embed_dim)
+
+ def forward(self, x):
+ """
+ Forward pass - chain self-attention + MLP blocks, with residual connections and layer norms.
+ """
+ x = x + self.nets["attention"](self.nets["ln1"](x))
+ x = x + self.nets["mlp"](self.nets["ln2"](x))
+ return x
+
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+
+ # this module doesn't modify the size of the input, it goes from (B, T, D) -> (B, T, D)
+ return list(input_shape)
+
+
+class GPT_Backbone(Module):
+ """the GPT model, with a context size of block_size"""
+
+ def __init__(
+ self,
+ embed_dim,
+ context_length,
+ attn_dropout=0.1,
+ block_output_dropout=0.1,
+ num_layers=6,
+ num_heads=8,
+ activation="gelu",
+ ):
+ """
+ Args:
+ embed_dim (int): dimension of embeddings to use for keys, queries, and values
+ used in self-attention
+
+ context_length (int): expected length of input sequences
+
+ attn_dropout (float): dropout probability for attention outputs for each transformer block
+
+ block_output_dropout (float): dropout probability for final outputs for each transformer block
+
+ num_layers (int): number of transformer blocks to stack
+
+ num_heads (int): number of attention heads - must divide @embed_dim evenly. Self-attention is
+ computed over this many partitions of the embedding dimension separately.
+
+ activation (str): string denoting the activation function to use in each transformer block
+
+ """
+ super(GPT_Backbone, self).__init__()
+
+ self.embed_dim = embed_dim
+ self.num_layers = num_layers
+ self.num_heads = num_heads
+ self.context_length = context_length
+ self.attn_dropout = attn_dropout
+ self.block_output_dropout = block_output_dropout
+
+ if activation == "gelu":
+ self.activation = nn.GELU()
+ elif activation == "geglu":
+ self.activation = GEGLU()
+
+ # create networks
+ self._create_networks()
+
+ # initialize weights
+ self.apply(self._init_weights)
+
+ print(
+ "Created {} model with number of parameters: {}".format(
+ self.__class__.__name__, sum(p.numel() for p in self.parameters())
+ )
+ )
+
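+    # Usage sketch (illustrative): the backbone maps a full-length embedding sequence of
+    # shape (B, context_length, embed_dim) to an output sequence of the same shape, e.g.
+    #
+    #     gpt = GPT_Backbone(embed_dim=64, context_length=10, num_layers=2, num_heads=4)
+    #     x = torch.randn(8, 10, 64)
+    #     out = gpt(x)             # (8, 10, 64)
+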
+ def _create_networks(self):
+ """
+ Helper function to create networks.
+ """
+ self.nets = nn.ModuleDict()
+
+ # transformer - cascaded transformer blocks
+ self.nets["transformer"] = nn.Sequential(
+ *[
+ SelfAttentionBlock(
+ embed_dim=self.embed_dim,
+ num_heads=self.num_heads,
+ context_length=self.context_length,
+ attn_dropout=self.attn_dropout,
+ output_dropout=self.block_output_dropout,
+ activation=self.activation,
+ )
+ for _ in range(self.num_layers)
+ ]
+ )
+
+ # decoder head
+ self.nets["output_ln"] = nn.LayerNorm(self.embed_dim)
+
+ def _init_weights(self, module):
+ """
+ Weight initializer.
+ """
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ module.weight.data.normal_(mean=0.0, std=0.02)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+
+        # this module takes inputs (B, T, @self.embed_dim) and produces outputs (B, T, @self.embed_dim)
+        return input_shape[:-1] + [self.embed_dim]
+
+ def forward(self, inputs):
+ assert inputs.shape[1:] == (self.context_length, self.embed_dim), inputs.shape
+ x = self.nets["transformer"](inputs)
+ transformer_output = self.nets["output_ln"](x)
+ return transformer_output
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/vae_nets.py b/phantom/submodules/phantom-robomimic/robomimic/models/vae_nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b4e7f02352126f17fcc92a0e651080a4e0bee6
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/vae_nets.py
@@ -0,0 +1,1386 @@
+"""
+Contains an implementation of Variational Autoencoder (VAE) and other
+variants, including other priors, and RNN-VAEs.
+"""
+import textwrap
+import numpy as np
+from copy import deepcopy
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributions as D
+
+import robomimic.utils.loss_utils as LossUtils
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.torch_utils as TorchUtils
+from robomimic.models.base_nets import Module
+from robomimic.models.obs_nets import MIMO_MLP
+
+
+def vae_args_from_config(vae_config):
+ """
+ Generate a set of VAE args that are read from the VAE-specific part
+ of a config (for example see `config.algo.vae` in BCConfig).
+ """
+ vae_args = dict(
+ encoder_layer_dims=vae_config.encoder_layer_dims,
+ decoder_layer_dims=vae_config.decoder_layer_dims,
+ latent_dim=vae_config.latent_dim,
+ decoder_is_conditioned=vae_config.decoder.is_conditioned,
+ decoder_reconstruction_sum_across_elements=vae_config.decoder.reconstruction_sum_across_elements,
+ latent_clip=vae_config.latent_clip,
+ prior_learn=vae_config.prior.learn,
+ prior_is_conditioned=vae_config.prior.is_conditioned,
+ prior_layer_dims=vae_config.prior_layer_dims,
+ prior_use_gmm=vae_config.prior.use_gmm,
+ prior_gmm_num_modes=vae_config.prior.gmm_num_modes,
+ prior_gmm_learn_weights=vae_config.prior.gmm_learn_weights,
+ prior_use_categorical=vae_config.prior.use_categorical,
+ prior_categorical_dim=vae_config.prior.categorical_dim,
+ prior_categorical_gumbel_softmax_hard=vae_config.prior.categorical_gumbel_softmax_hard,
+ )
+ return vae_args
+
+
+class Prior(Module):
+ """
+ Base class for VAE priors. It's basically the same as a @MIMO_MLP network (it
+ instantiates one) but it supports additional methods such as KL loss computation
+ and sampling, and also may learn prior parameters as observation-independent
+ torch Parameters instead of observation-dependent mappings.
+ """
+ def __init__(
+ self,
+ param_shapes,
+ param_obs_dependent,
+ obs_shapes=None,
+ mlp_layer_dims=(),
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ param_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for parameters that determine the prior
+ distribution.
+
+ param_obs_dependent (OrderedDict): a dictionary with boolean
+ values consistent with @param_shapes which determines whether
+ to learn parameters as part of the (obs-dependent) network or
+ directly as learnable parameters.
+
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations.
+
+ mlp_layer_dims ([int]): sequence of integers for the MLP hidden layer sizes
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ super(Prior, self).__init__()
+
+ assert isinstance(param_shapes, OrderedDict) and isinstance(param_obs_dependent, OrderedDict)
+ assert set(param_shapes.keys()) == set(param_obs_dependent.keys())
+ self.param_shapes = param_shapes
+ self.param_obs_dependent = param_obs_dependent
+
+ net_kwargs = dict(
+ obs_shapes=obs_shapes,
+ mlp_layer_dims=mlp_layer_dims,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+ self._create_layers(net_kwargs)
+
+ def _create_layers(self, net_kwargs):
+ """
+ Create networks and parameters needed by the prior.
+ """
+ self.prior_params = nn.ParameterDict()
+
+ self._is_obs_dependent = False
+ mlp_output_shapes = OrderedDict()
+ for pp in self.param_shapes:
+ if self.param_obs_dependent[pp]:
+ # prior parameters will be a function of observations using a network
+ mlp_output_shapes[pp] = self.param_shapes[pp]
+ else:
+ # learnable prior parameters independent of observation
+ param_init = torch.randn(*self.param_shapes[pp]) / np.sqrt(np.prod(self.param_shapes[pp]))
+ self.prior_params[pp] = torch.nn.Parameter(param_init)
+
+ # only make networks if we have obs-dependent prior parameters
+ self.prior_module = None
+ if len(mlp_output_shapes) > 0:
+ # create @MIMO_MLP that takes obs and goal dicts and returns prior params
+ self._is_obs_dependent = True
+ obs_shapes = net_kwargs["obs_shapes"]
+ goal_shapes = net_kwargs["goal_shapes"]
+ obs_group_shapes = OrderedDict()
+ assert isinstance(obs_shapes, OrderedDict)
+ obs_group_shapes["obs"] = OrderedDict(obs_shapes)
+ if goal_shapes is not None and len(goal_shapes) > 0:
+ assert isinstance(goal_shapes, OrderedDict)
+ obs_group_shapes["goal"] = OrderedDict(goal_shapes)
+ self.prior_module = MIMO_MLP(
+ input_obs_group_shapes=obs_group_shapes,
+ output_shapes=mlp_output_shapes,
+ layer_dims=net_kwargs["mlp_layer_dims"],
+ encoder_kwargs=net_kwargs["encoder_kwargs"],
+ )
+
+ def sample(self, n, obs_dict=None, goal_dict=None):
+ """
+ Returns a batch of samples from the prior distribution.
+
+ Args:
+ n (int): this argument is used to specify the number
+ of samples to generate from the prior.
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent. Leading dimension should
+ be consistent with @n, the number of samples to generate.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ z (torch.Tensor): batch of sampled latent vectors.
+ """
+ raise NotImplementedError
+
+ def kl_loss(self, posterior_params, z=None, obs_dict=None, goal_dict=None):
+ """
+ Computes sample-based KL divergence loss between the Gaussian distribution
+ given by @mu, @logvar and the prior distribution.
+
+ Args:
+ posterior_params (dict): dictionary with keys "mu" and "logvar" corresponding
+ to torch.Tensor batch of means and log-variances of posterior Gaussian
+ distribution.
+
+ z (torch.Tensor): samples from the Gaussian distribution parametrized by
+ @mu and @logvar. May not be needed depending on the prior.
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ kl_loss (torch.Tensor): KL divergence loss
+ """
+ raise NotImplementedError
+
+ def output_shape(self, input_shape=None):
+ """
+ Returns output shape for this module, which is a dictionary instead
+ of a list since outputs are dictionaries.
+ """
+ if self.prior_module is not None:
+ return self.prior_module.output_shape(input_shape)
+ return { k : list(self.param_shapes[k]) for k in self.param_shapes }
+
+ def forward(self, batch_size, obs_dict=None, goal_dict=None):
+ """
+ Computes prior parameters.
+
+ Args:
+ batch_size (int): batch size - this is needed for parameters that are
+ not obs-dependent, to make sure the leading dimension is correct
+ for downstream sampling and loss computation purposes
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ prior_params (dict): dictionary containing prior parameters
+ """
+ prior_params = dict()
+ if self._is_obs_dependent:
+ # forward through network for obs-dependent params
+ prior_params = self.prior_module.forward(obs=obs_dict, goal=goal_dict)
+
+ # return params that do not depend on obs as well
+ for pp in self.param_shapes:
+ if not self.param_obs_dependent[pp]:
+ # ensure leading dimension will be consistent with other params
+ prior_params[pp] = TensorUtils.expand_at(self.prior_params[pp], size=batch_size, dim=0)
+
+ # ensure leading dimensions are all consistent
+ TensorUtils.assert_size_at_dim(prior_params, size=batch_size, dim=0,
+ msg="prior params dim 0 mismatch in forward")
+
+ return prior_params
+
+
+class GaussianPrior(Prior):
+ """
+ A class that holds functionality for learning both unimodal Gaussian priors and
+ multimodal Gaussian Mixture Model priors for use in VAEs.
+ """
+ def __init__(
+ self,
+ latent_dim,
+ device,
+ latent_clip=None,
+ learnable=False,
+ use_gmm=False,
+ gmm_num_modes=10,
+ gmm_learn_weights=False,
+ obs_shapes=None,
+ mlp_layer_dims=(),
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ latent_dim (int): size of latent dimension for the prior
+
+ device (torch.Device): where the module should live (i.e. cpu, gpu)
+
+ latent_clip (float): if provided, clip all latents sampled at
+ test-time in each dimension to (-@latent_clip, @latent_clip)
+
+ learnable (bool): if True, learn the parameters of the prior (as opposed
+ to a default N(0, 1) prior)
+
+ use_gmm (bool): if True, learn a Gaussian Mixture Model (GMM)
+ prior instead of a unimodal Gaussian prior. To use this option,
+ @learnable must be set to True.
+
+ gmm_num_modes (int): number of GMM modes to learn. Only
+ used if @use_gmm is True.
+
+ gmm_learn_weights (bool): if True, learn the weights of the GMM
+ model instead of setting them to be uniform across all the modes.
+ Only used if @use_gmm is True.
+
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations. If provided, assumes that
+ the prior should depend on observation inputs, and networks
+ will be created to output prior parameters.
+
+ mlp_layer_dims ([int]): sequence of integers for the MLP hidden layer sizes
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ self.device = device
+ self.latent_dim = latent_dim
+ self.latent_clip = latent_clip
+ self.learnable = learnable
+
+ self.use_gmm = use_gmm
+ if self.use_gmm:
+ self.num_modes = gmm_num_modes
+ else:
+ # unimodal Gaussian prior
+ self.num_modes = 1
+ self.gmm_learn_weights = gmm_learn_weights
+
+ self._input_dependent = (obs_shapes is not None) and (len(obs_shapes) > 0)
+
+ if self._input_dependent:
+ assert learnable
+ assert isinstance(obs_shapes, OrderedDict)
+
+ # network will generate mean and logvar
+ param_shapes = OrderedDict(
+ mean=(self.num_modes, self.latent_dim,),
+ logvar=(self.num_modes, self.latent_dim,),
+ )
+ param_obs_dependent = OrderedDict(mean=True, logvar=True)
+
+ if self.use_gmm and self.gmm_learn_weights:
+ # network generates GMM weights
+ param_shapes["weight"] = (self.num_modes,)
+ param_obs_dependent["weight"] = True
+ else:
+ # learn obs-indep mean / logvar
+ param_shapes = OrderedDict(
+ mean=(1, self.num_modes, self.latent_dim),
+ logvar=(1, self.num_modes, self.latent_dim),
+ )
+ param_obs_dependent = OrderedDict(mean=False, logvar=False)
+
+ if self.use_gmm and self.gmm_learn_weights:
+ # learn obs-indep GMM weights
+ param_shapes["weight"] = (1, self.num_modes)
+ param_obs_dependent["weight"] = False
+
+ super(GaussianPrior, self).__init__(
+ param_shapes=param_shapes,
+ param_obs_dependent=param_obs_dependent,
+ obs_shapes=obs_shapes,
+ mlp_layer_dims=mlp_layer_dims,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _create_layers(self, net_kwargs):
+ """
+ Update from superclass to only create parameters / networks if not using
+ N(0, 1) Gaussian prior.
+ """
+ if self.learnable:
+ super(GaussianPrior, self)._create_layers(net_kwargs)
+
+ def sample(self, n, obs_dict=None, goal_dict=None):
+ """
+ Returns a batch of samples from the prior distribution.
+
+ Args:
+ n (int): this argument is used to specify the number
+ of samples to generate from the prior.
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent. Leading dimension should
+ be consistent with @n, the number of samples to generate.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ z (torch.Tensor): batch of sampled latent vectors.
+ """
+
+ # check consistency between n and obs_dict
+ if self._input_dependent:
+ TensorUtils.assert_size_at_dim(obs_dict, size=n, dim=0,
+ msg="obs dict and n mismatch in @sample")
+
+ if self.learnable:
+
+ # forward to get parameters
+ out = self.forward(batch_size=n, obs_dict=obs_dict, goal_dict=goal_dict)
+ prior_means, prior_logvars, prior_logweights = out["means"], out["logvars"], out["logweights"]
+
+ if prior_logweights is not None:
+ prior_weights = torch.exp(prior_logweights)
+
+ if self.use_gmm:
+ # learned GMM
+
+ # make uniform weights (in the case that weights were not learned)
+ if not self.gmm_learn_weights:
+ prior_weights = torch.ones(n, self.num_modes).to(prior_means.device) / self.num_modes
+
+ # sample modes
+ gmm_mode_indices = D.Categorical(prior_weights).sample()
+
+ # get GMM centers and sample using reparametrization trick
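+                # (the standard reparametrization trick: z = mean + exp(0.5 * logvar) * eps
+                # with eps ~ N(0, I), so sampling stays differentiable w.r.t. the prior params)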
+ selected_means = TensorUtils.gather_sequence(prior_means, indices=gmm_mode_indices)
+ selected_logvars = TensorUtils.gather_sequence(prior_logvars, indices=gmm_mode_indices)
+ z = TorchUtils.reparameterize(selected_means, selected_logvars)
+
+ else:
+ # learned unimodal Gaussian - remove mode dim and sample from Gaussian using reparametrization trick
+ z = TorchUtils.reparameterize(prior_means[:, 0, :], prior_logvars[:, 0, :])
+
+ else:
+ # sample from N(0, 1)
+ z = torch.randn(n, self.latent_dim).float().to(self.device)
+
+ if self.latent_clip is not None:
+ z = z.clamp(-self.latent_clip, self.latent_clip)
+
+ return z
+
+ def kl_loss(self, posterior_params, z=None, obs_dict=None, goal_dict=None):
+ """
+ Computes sample-based KL divergence loss between the Gaussian distribution
+ given by @mu, @logvar and the prior distribution.
+
+ Args:
+ posterior_params (dict): dictionary with keys "mu" and "logvar" corresponding
+ to torch.Tensor batch of means and log-variances of posterior Gaussian
+ distribution.
+
+ z (torch.Tensor): samples from the Gaussian distribution parametrized by
+ @mu and @logvar. Only needed if @self.use_gmm is True.
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ kl_loss (torch.Tensor): KL divergence loss
+ """
+ mu = posterior_params["mean"]
+ logvar = posterior_params["logvar"]
+
+ if not self.learnable:
+ # closed-form Gaussian KL from N(0, 1) prior
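+            # for reference, the closed form is:
+            #   KL(N(mu, diag(sigma^2)) || N(0, I)) = -0.5 * sum_d (1 + logvar_d - mu_d^2 - exp(logvar_d))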
+ return LossUtils.KLD_0_1_loss(mu=mu, logvar=logvar)
+
+ # forward to get parameters
+ out = self.forward(batch_size=mu.shape[0], obs_dict=obs_dict, goal_dict=goal_dict)
+ prior_means, prior_logvars, prior_logweights = out["means"], out["logvars"], out["logweights"]
+
+ if not self.use_gmm:
+ # collapse mode dimension and compute Gaussian KL in closed-form
+ prior_means = prior_means[:, 0, :]
+ prior_logvars = prior_logvars[:, 0, :]
+ return LossUtils.KLD_gaussian_loss(
+ mu_1=mu,
+ logvar_1=logvar,
+ mu_2=prior_means,
+ logvar_2=prior_logvars,
+ )
+
+ # GMM KL loss computation
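+        # (single-sample Monte-Carlo estimate of KL(q(z|x) || p(z)) = E_q[log q(z|x) - log p(z)],
+        # evaluated at the posterior sample z passed in from the encoder)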
+ var = torch.exp(logvar.clamp(-8, 30)) # clamp for numerical stability
+ prior_vars = torch.exp(prior_logvars.clamp(-8, 30))
+ kl_loss = LossUtils.log_normal(x=z, m=mu, v=var) \
+ - LossUtils.log_normal_mixture(x=z, m=prior_means, v=prior_vars, log_w=prior_logweights)
+ return kl_loss.mean()
+
+ def forward(self, batch_size, obs_dict=None, goal_dict=None):
+ """
+ Computes means, logvars, and GMM weights (if using GMM and learning weights).
+
+ Args:
+ batch_size (int): batch size - this is needed for parameters that are
+ not obs-dependent, to make sure the leading dimension is correct
+ for downstream sampling and loss computation purposes
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ prior_params (dict): dictionary containing prior parameters
+ """
+ assert self.learnable
+ prior_params = super(GaussianPrior, self).forward(
+ batch_size=batch_size, obs_dict=obs_dict, goal_dict=goal_dict)
+
+ if self.use_gmm and self.gmm_learn_weights:
+ # normalize learned weight outputs to sum to 1
+ logweights = F.log_softmax(prior_params["weight"], dim=-1)
+ else:
+ logweights = None
+ assert "weight" not in prior_params
+
+ out = dict(means=prior_params["mean"], logvars=prior_params["logvar"], logweights=logweights)
+ return out
+
+ def __repr__(self):
+ """Pretty print network"""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ indent = ' ' * 4
+ msg += textwrap.indent("latent_dim={}\n".format(self.latent_dim), indent)
+ msg += textwrap.indent("latent_clip={}\n".format(self.latent_clip), indent)
+ msg += textwrap.indent("learnable={}\n".format(self.learnable), indent)
+ msg += textwrap.indent("input_dependent={}\n".format(self._input_dependent), indent)
+ msg += textwrap.indent("use_gmm={}\n".format(self.use_gmm), indent)
+ if self.use_gmm:
+ msg += textwrap.indent("gmm_num_nodes={}\n".format(self.num_modes), indent)
+ msg += textwrap.indent("gmm_learn_weights={}\n".format(self.gmm_learn_weights), indent)
+ if self.learnable:
+ if self.prior_module is not None:
+ msg += textwrap.indent("\nprior_module={}\n".format(self.prior_module), indent)
+ msg += textwrap.indent("prior_params={}\n".format(self.prior_params), indent)
+ msg = header + '(\n' + msg + ')'
+ return msg
+
+
+class CategoricalPrior(Prior):
+ """
+ A class that holds functionality for learning categorical priors for use
+ in VAEs.
+ """
+ def __init__(
+ self,
+ latent_dim,
+ categorical_dim,
+ device,
+ learnable=False,
+ obs_shapes=None,
+ mlp_layer_dims=(),
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ latent_dim (int): size of latent dimension for the prior
+
+ categorical_dim (int): size of categorical dimension (number of classes
+ for each dimension of latent space)
+
+ device (torch.Device): where the module should live (i.e. cpu, gpu)
+
+ learnable (bool): if True, learn the parameters of the prior (as opposed
+                to a default uniform categorical prior)
+
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations. If provided, assumes that
+ the prior should depend on observation inputs, and networks
+ will be created to output prior parameters.
+
+ mlp_layer_dims ([int]): sequence of integers for the MLP hidden layer sizes
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ self.device = device
+ self.latent_dim = latent_dim
+ self.categorical_dim = categorical_dim
+ self.learnable = learnable
+
+ self._input_dependent = (obs_shapes is not None) and (len(obs_shapes) > 0)
+
+ if self._input_dependent:
+ assert learnable
+ assert isinstance(obs_shapes, OrderedDict)
+
+ # network will generate logits for categorical distributions
+ param_shapes = OrderedDict(
+ logit=(self.latent_dim, self.categorical_dim,)
+ )
+ param_obs_dependent = OrderedDict(logit=True)
+ else:
+            # learn obs-independent logits
+ param_shapes = OrderedDict(
+ logit=(1, self.latent_dim, self.categorical_dim),
+ )
+ param_obs_dependent = OrderedDict(logit=False)
+
+ super(CategoricalPrior, self).__init__(
+ param_shapes=param_shapes,
+ param_obs_dependent=param_obs_dependent,
+ obs_shapes=obs_shapes,
+ mlp_layer_dims=mlp_layer_dims,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _create_layers(self, net_kwargs):
+ """
+ Update from superclass to only create parameters / networks if not using
+ uniform categorical prior.
+ """
+ if self.learnable:
+ super(CategoricalPrior, self)._create_layers(net_kwargs)
+
+ def sample(self, n, obs_dict=None, goal_dict=None):
+ """
+ Returns a batch of samples from the prior distribution.
+
+ Args:
+ n (int): this argument is used to specify the number
+ of samples to generate from the prior.
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent. Leading dimension should
+ be consistent with @n, the number of samples to generate.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ z (torch.Tensor): batch of sampled latent vectors.
+ """
+
+ # check consistency between n and obs_dict
+ if self._input_dependent:
+ TensorUtils.assert_size_at_dim(obs_dict, size=n, dim=0,
+ msg="obs dict and n mismatch in @sample")
+
+ if self.learnable:
+
+ # forward to get parameters
+ out = self.forward(batch_size=n, obs_dict=obs_dict, goal_dict=goal_dict)
+ prior_logits = out["logit"]
+
+ # sample one-hot latents from categorical distribution
+ dist = D.Categorical(logits=prior_logits)
+ z = TensorUtils.to_one_hot(dist.sample(), num_class=self.categorical_dim)
+
+ else:
+ # try to include a categorical sample for each class if possible (ensuring rough uniformity)
+ if (self.latent_dim == 1) and (self.categorical_dim <= n):
+ # include samples [0, 1, ..., C - 1] and then repeat until batch is filled
+ dist_samples = torch.arange(n).remainder(self.categorical_dim).unsqueeze(-1).to(self.device)
+ else:
+ # sample one-hot latents from uniform categorical distribution for each latent dimension
+ probs = torch.ones(n, self.latent_dim, self.categorical_dim).float().to(self.device)
+ dist_samples = D.Categorical(probs=probs).sample()
+ z = TensorUtils.to_one_hot(dist_samples, num_class=self.categorical_dim)
+
+ # reshape [B, D, C] to [B, D * C] to be consistent with other priors that return flat latents
+ z = z.reshape(*z.shape[:-2], -1)
+ return z
+
+ def kl_loss(self, posterior_params, z=None, obs_dict=None, goal_dict=None):
+ """
+ Computes KL divergence loss between the Categorical distribution
+        given by the posterior's unnormalized logits and the prior distribution.
+
+ Args:
+            posterior_params (dict): dictionary with key "logit" corresponding
+ to torch.Tensor batch of unnormalized logits of shape [B, D * C]
+ that corresponds to the posterior categorical distribution
+
+ z (torch.Tensor): samples from encoder - unused for this prior
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ kl_loss (torch.Tensor): KL divergence loss
+ """
+ logits = posterior_params["logit"].reshape(-1, self.latent_dim, self.categorical_dim)
+ if not self.learnable:
+ # prior logits correspond to uniform categorical distribution
+ prior_logits = torch.zeros_like(logits)
+ else:
+ # forward to get parameters
+ out = self.forward(batch_size=posterior_params["logit"].shape[0], obs_dict=obs_dict, goal_dict=goal_dict)
+ prior_logits = out["logit"]
+
+ prior_dist = D.Categorical(logits=prior_logits)
+ posterior_dist = D.Categorical(logits=logits)
+
+ # sum over latent dimensions, but average over batch dimension
+ kl_loss = D.kl_divergence(posterior_dist, prior_dist)
+ assert len(kl_loss.shape) == 2
+ return kl_loss.sum(-1).mean()
+
+ def forward(self, batch_size, obs_dict=None, goal_dict=None):
+ """
+ Computes prior logits (unnormalized log-probs).
+
+ Args:
+ batch_size (int): batch size - this is needed for parameters that are
+ not obs-dependent, to make sure the leading dimension is correct
+ for downstream sampling and loss computation purposes
+
+ obs_dict (dict): inputs according to @obs_shapes. Only needs to be provided
+ if any prior parameters are obs-dependent.
+
+ goal_dict (dict): inputs according to @goal_shapes (only if using goal observations)
+
+ Returns:
+ prior_params (dict): dictionary containing prior parameters
+ """
+ assert self.learnable
+ return super(CategoricalPrior, self).forward(
+ batch_size=batch_size, obs_dict=obs_dict, goal_dict=goal_dict)
+
+ def __repr__(self):
+ """Pretty print network"""
+ header = '{}'.format(str(self.__class__.__name__))
+ msg = ''
+ indent = ' ' * 4
+ msg += textwrap.indent("latent_dim={}\n".format(self.latent_dim), indent)
+ msg += textwrap.indent("categorical_dim={}\n".format(self.categorical_dim), indent)
+ msg += textwrap.indent("learnable={}\n".format(self.learnable), indent)
+ msg += textwrap.indent("input_dependent={}\n".format(self._input_dependent), indent)
+ if self.learnable:
+ if self.prior_module is not None:
+ msg += textwrap.indent("\nprior_module={}\n".format(self.prior_module), indent)
+ msg += textwrap.indent("prior_params={}\n".format(self.prior_params), indent)
+ msg = header + '(\n' + msg + ')'
+ return msg
+
+
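+# Illustrative sketch (not part of robomimic): how the non-learnable branch of
+# CategoricalPrior.sample turns uniform categorical draws into the flat one-hot
+# latents of shape [B, D * C] that downstream networks consume. Plain torch only;
+# the sizes below are made-up examples.
+def _example_uniform_categorical_latents(n=4, latent_dim=2, categorical_dim=3):
+    import torch
+    import torch.distributions as D
+    import torch.nn.functional as F
+
+    # draw a class index for every latent dimension of every sample
+    probs = torch.ones(n, latent_dim, categorical_dim)
+    idx = D.Categorical(probs=probs).sample()                 # [n, D]
+
+    # convert to one-hot and flatten, matching the prior's output convention
+    z = F.one_hot(idx, num_classes=categorical_dim).float()   # [n, D, C]
+    return z.reshape(n, -1)                                   # [n, D * C]
+
+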
+class VAE(torch.nn.Module):
+ """
+ A Variational Autoencoder (VAE), as described in https://arxiv.org/abs/1312.6114.
+
+ Models a distribution p(X) or a conditional distribution p(X | Y), where each
+ variable can consist of multiple modalities. The target variable X, whose
+ distribution is modeled, is specified through the @input_shapes argument,
+ which is a map between modalities (strings) and expected shapes. In this way,
+ a variable that consists of multiple kinds of data (e.g. image and flat-dimensional)
+ can be modeled as well. A separate @output_shapes argument is used to specify the
+ expected reconstructions - this allows for asymmetric reconstruction (for example,
+ reconstructing low-resolution images).
+
+ This implementation supports learning conditional distributions as well (cVAE).
+ The conditioning variable Y is specified through the @condition_shapes argument,
+ which is also a map between modalities (strings) and expected shapes. In this way,
+ variables with multiple kinds of data (e.g. image and flat-dimensional) can
+ jointly be conditioned on. By default, the decoder takes the conditioning
+ variable Y as input. To force the decoder to reconstruct from just the latent,
+ set @decoder_is_conditioned to False (in this case, the prior must be conditioned).
+
+ The implementation also supports learning expressive priors instead of using
+ the usual N(0, 1) prior. There are three kinds of priors supported - Gaussian,
+ Gaussian Mixture Model (GMM), and Categorical. For each prior, the parameters can
+ be learned as independent parameters, or be learned as functions of the conditioning
+ variable Y (by setting @prior_is_conditioned).
+ """
+ def __init__(
+ self,
+ input_shapes,
+ output_shapes,
+ encoder_layer_dims,
+ decoder_layer_dims,
+ latent_dim,
+ device,
+ condition_shapes=None,
+ decoder_is_conditioned=True,
+ decoder_reconstruction_sum_across_elements=False,
+ latent_clip=None,
+ output_squash=(),
+ output_scales=None,
+ output_ranges=None,
+ prior_learn=False,
+ prior_is_conditioned=False,
+ prior_layer_dims=(),
+ prior_use_gmm=False,
+ prior_gmm_num_modes=10,
+ prior_gmm_learn_weights=False,
+ prior_use_categorical=False,
+ prior_categorical_dim=10,
+ prior_categorical_gumbel_softmax_hard=False,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ input_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for all encoder-specific inputs. This corresponds
+ to the variable X whose distribution we are learning.
+
+ output_shapes (OrderedDict): a dictionary that maps modality to
+ expected shape for outputs to reconstruct. Usually, this is
+ the same as @input_shapes but this argument allows
+ for asymmetries, such as reconstructing low-resolution
+ images.
+
+ encoder_layer_dims ([int]): sequence of integers for the encoder hidden
+ layer sizes.
+
+ decoder_layer_dims ([int]): sequence of integers for the decoder hidden
+ layer sizes.
+
+ latent_dim (int): dimension of latent space for the VAE
+
+ device (torch.Device): where the module should live (i.e. cpu, gpu)
+
+ condition_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for all conditioning inputs. If this is provided,
+ a conditional distribution is modeled (cVAE). Conditioning takes
+ place in the decoder by default, and optionally, the prior.
+
+ decoder_is_conditioned (bool): whether to condition the decoder
+ on the conditioning variables. True by default. Only used if
+ @condition_shapes is not empty.
+
+ decoder_reconstruction_sum_across_elements (bool): by default, VAEs
+ average across modality elements and modalities when computing
+ reconstruction loss. If this is True, sum across all dimensions
+ and modalities instead.
+
+ latent_clip (float): if provided, clip all latents sampled at
+ test-time in each dimension to (-@latent_clip, @latent_clip)
+
+ output_squash ([str]): an iterable of modalities that should be
+ a subset of @output_shapes. The decoder outputs for these
+ modalities will be squashed into a symmetric range [-a, a]
+ by using a tanh layer and then scaling the output with the
+ corresponding value in the @output_scales dictionary.
+
+ output_scales (dict): a dictionary that maps modality to a
+ scaling value. Used in conjunction with @output_squash.
+
+            output_ranges (dict): a dictionary that maps modality to an [a, b] output range.
+                When @output_ranges is specified (not None), @output_scales should be None.
+
+ prior_learn (bool): if True, the prior distribution parameters
+ are also learned through the KL-divergence loss (instead
+ of being constrained to a N(0, 1) Gaussian distribution).
+                If @prior_is_conditioned is False, a global set of parameters
+                is learned; otherwise, a prior network that maps modalities
+                in @condition_shapes to prior parameters is
+                learned. By default, a Gaussian prior is learned, unless
+ @prior_use_gmm is True, in which case a Gaussian Mixture
+ Model (GMM) prior is learned.
+
+ prior_is_conditioned (bool): whether to condition the prior
+ on the conditioning variables. False by default. Only used if
+ @condition_shapes is not empty. If this is set to True,
+ @prior_learn must be True.
+
+ prior_layer_dims ([int]): sequence of integers for the prior hidden layer
+ sizes. Only used for learned priors that take condition variables as
+ input (i.e. when @prior_learn and @prior_is_conditioned are set to True,
+ and @condition_shapes is not empty).
+
+ prior_use_gmm (bool): if True, learn a Gaussian Mixture Model (GMM)
+ prior instead of a unimodal Gaussian prior. To use this option,
+ @prior_learn must be set to True.
+
+ prior_gmm_num_modes (int): number of GMM modes to learn. Only
+ used if @prior_use_gmm is True.
+
+ prior_gmm_learn_weights (bool): if True, learn the weights of the GMM
+ model instead of setting them to be uniform across all the modes.
+ Only used if @prior_use_gmm is True.
+
+ prior_use_categorical (bool): if True, use a categorical prior instead of
+ a unimodal Gaussian prior. This will also cause the encoder to output
+ a categorical distribution, and will use the Gumbel-Softmax trick
+ for reparametrization.
+
+ prior_categorical_dim (int): categorical dimension - each latent sampled
+ from the prior will be of shape (@latent_dim, @prior_categorical_dim)
+ and will be "one-hot" in the latter dimension. Only used if
+ @prior_use_categorical is True.
+
+ prior_categorical_gumbel_softmax_hard (bool): if True, use the "hard" version of
+ Gumbel Softmax for reparametrization. Only used if @prior_use_categorical is True.
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+                expected shapes for goal observations. Goals are treated as additional
+                conditioning inputs. They are usually specified separately because
+                they may share modalities with the conditioning inputs (otherwise
+ they could just be added to the set of conditioning inputs).
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ super(VAE, self).__init__()
+
+ self.latent_dim = latent_dim
+ self.latent_clip = latent_clip
+ self.device = device
+
+ # encoder and decoder input dicts and output shapes dict for reconstruction
+ assert isinstance(input_shapes, OrderedDict)
+ assert isinstance(output_shapes, OrderedDict)
+ self.input_shapes = deepcopy(input_shapes)
+ self.output_shapes = deepcopy(output_shapes)
+
+ # check for conditioning (cVAE)
+ self._is_cvae = False
+ self.condition_shapes = deepcopy(condition_shapes) if condition_shapes is not None else OrderedDict()
+ if len(self.condition_shapes) > 0:
+ # this is a cVAE - we learn a conditional distribution p(X | Y)
+ assert isinstance(self.condition_shapes, OrderedDict)
+ self._is_cvae = True
+ self.decoder_is_conditioned = decoder_is_conditioned
+ self.prior_is_conditioned = prior_is_conditioned
+ assert self.decoder_is_conditioned or self.prior_is_conditioned, \
+ "cVAE must be conditioned in decoder and/or prior"
+ if self.prior_is_conditioned:
+ assert prior_learn, "to pass conditioning inputs to prior, prior must be learned"
+
+ # check for goal conditioning
+ self._is_goal_conditioned = False
+ self.goal_shapes = deepcopy(goal_shapes) if goal_shapes is not None else OrderedDict()
+ if len(self.goal_shapes) > 0:
+ assert self._is_cvae, "to condition VAE on goals, it must be a cVAE"
+ assert isinstance(self.goal_shapes, OrderedDict)
+ self._is_goal_conditioned = True
+
+ self.encoder_layer_dims = encoder_layer_dims
+ self.decoder_layer_dims = decoder_layer_dims
+
+ # determines whether outputs are squashed with tanh and if so, to what scaling
+ assert not (output_scales is not None and output_ranges is not None)
+ self.output_squash = output_squash
+ self.output_scales = output_scales if output_scales is not None else OrderedDict()
+ self.output_ranges = output_ranges if output_ranges is not None else OrderedDict()
+
+ assert set(self.output_squash) == set(self.output_scales.keys())
+ assert set(self.output_squash).issubset(set(self.output_shapes))
+
+ # decoder settings
+ self.decoder_reconstruction_sum_across_elements = decoder_reconstruction_sum_across_elements
+
+ # prior parameters
+ self.prior_learn = prior_learn
+ self.prior_layer_dims = prior_layer_dims
+ self.prior_use_gmm = prior_use_gmm
+ self.prior_gmm_num_modes = prior_gmm_num_modes
+ self.prior_gmm_learn_weights = prior_gmm_learn_weights
+ self.prior_use_categorical = prior_use_categorical
+ self.prior_categorical_dim = prior_categorical_dim
+ self.prior_categorical_gumbel_softmax_hard = prior_categorical_gumbel_softmax_hard
+ assert np.sum([self.prior_use_gmm, self.prior_use_categorical]) <= 1
+
+ # for obs core
+ self._encoder_kwargs = encoder_kwargs
+
+ if self.prior_use_gmm:
+ assert self.prior_learn, "GMM must be learned"
+
+ if self.prior_use_categorical:
+ # initialize temperature for Gumbel-Softmax
+ self.set_gumbel_temperature(1.0)
+
+ # create encoder, decoder, prior
+ self._create_layers()
+
+ def _create_layers(self):
+ """
+ Creates the encoder, decoder, and prior networks.
+ """
+ self.nets = nn.ModuleDict()
+
+ # VAE Encoder
+ self._create_encoder()
+
+ # VAE Decoder
+ self._create_decoder()
+
+ # VAE Prior.
+ self._create_prior()
+
+ def _create_encoder(self):
+ """
+ Helper function to create encoder.
+ """
+
+ # encoder takes "input" dictionary and possibly "condition" (if cVAE) and "goal" (if goal-conditioned)
+ encoder_obs_group_shapes = OrderedDict()
+ encoder_obs_group_shapes["input"] = OrderedDict(self.input_shapes)
+ if self._is_cvae:
+ encoder_obs_group_shapes["condition"] = OrderedDict(self.condition_shapes)
+ if self._is_goal_conditioned:
+ encoder_obs_group_shapes["goal"] = OrderedDict(self.goal_shapes)
+
+ # encoder outputs posterior distribution parameters
+ if self.prior_use_categorical:
+ encoder_output_shapes = OrderedDict(
+ logit=(self.latent_dim * self.prior_categorical_dim,),
+ )
+ else:
+ encoder_output_shapes = OrderedDict(
+ mean=(self.latent_dim,),
+ logvar=(self.latent_dim,),
+ )
+
+ self.nets["encoder"] = MIMO_MLP(
+ input_obs_group_shapes=encoder_obs_group_shapes,
+ output_shapes=encoder_output_shapes,
+ layer_dims=self.encoder_layer_dims,
+ encoder_kwargs=self._encoder_kwargs,
+ )
+
+ def _create_decoder(self):
+ """
+ Helper function to create decoder.
+ """
+
+ # decoder takes latent (included as "input" observation group) and possibly "condition" (if cVAE) and "goal" (if goal-conditioned)
+ decoder_obs_group_shapes = OrderedDict()
+ latent_shape = (self.latent_dim,)
+ if self.prior_use_categorical:
+ latent_shape = (self.latent_dim * self.prior_categorical_dim,)
+ decoder_obs_group_shapes["input"] = OrderedDict(latent=latent_shape)
+ if self._is_cvae:
+ decoder_obs_group_shapes["condition"] = OrderedDict(self.condition_shapes)
+ if self._is_goal_conditioned:
+ decoder_obs_group_shapes["goal"] = OrderedDict(self.goal_shapes)
+
+ self.nets["decoder"] = MIMO_MLP(
+ input_obs_group_shapes=decoder_obs_group_shapes,
+ output_shapes=self.output_shapes,
+ layer_dims=self.decoder_layer_dims,
+ encoder_kwargs=self._encoder_kwargs,
+ )
+
+ def _create_prior(self):
+ """
+ Helper function to create prior.
+ """
+
+ # prior possibly takes "condition" (if cVAE) and "goal" (if goal-conditioned)
+ prior_obs_group_shapes = OrderedDict(condition=None, goal=None)
+ if self._is_cvae and self.prior_is_conditioned:
+ prior_obs_group_shapes["condition"] = OrderedDict(self.condition_shapes)
+ if self._is_goal_conditioned:
+ prior_obs_group_shapes["goal"] = OrderedDict(self.goal_shapes)
+
+ if self.prior_use_categorical:
+ self.nets["prior"] = CategoricalPrior(
+ latent_dim=self.latent_dim,
+ categorical_dim=self.prior_categorical_dim,
+ device=self.device,
+ learnable=self.prior_learn,
+ obs_shapes=prior_obs_group_shapes["condition"],
+ mlp_layer_dims=self.prior_layer_dims,
+ goal_shapes=prior_obs_group_shapes["goal"],
+ encoder_kwargs=self._encoder_kwargs,
+ )
+ else:
+ self.nets["prior"] = GaussianPrior(
+ latent_dim=self.latent_dim,
+ device=self.device,
+ latent_clip=self.latent_clip,
+ learnable=self.prior_learn,
+ use_gmm=self.prior_use_gmm,
+ gmm_num_modes=self.prior_gmm_num_modes,
+ gmm_learn_weights=self.prior_gmm_learn_weights,
+ obs_shapes=prior_obs_group_shapes["condition"],
+ mlp_layer_dims=self.prior_layer_dims,
+ goal_shapes=prior_obs_group_shapes["goal"],
+ encoder_kwargs=self._encoder_kwargs,
+ )
+
+ def encode(self, inputs, conditions=None, goals=None):
+ """
+ Args:
+ inputs (dict): a dictionary that maps input modalities to torch.Tensor
+ batches. These should correspond to the encoder-only modalities
+                (i.e. @self.input_shapes).
+
+ conditions (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to the modalities used for conditioning
+ in either the decoder or the prior (or both). Only for cVAEs.
+
+ goals (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to goal modalities. Only for cVAEs.
+
+ Returns:
+ posterior params (dict): dictionary with posterior parameters
+ """
+ return self.nets["encoder"](
+ input=inputs,
+ condition=conditions,
+ goal=goals,
+ )
+
+ def reparameterize(self, posterior_params):
+ """
+ Args:
+            posterior_params (dict): dictionary from encoder forward pass that
+ parametrizes the encoder distribution
+
+ Returns:
+ z (torch.Tensor): sampled latents that are also differentiable
+ """
+ if self.prior_use_categorical:
+ # reshape to [B, D, C] to take softmax across categorical classes
+ logits = posterior_params["logit"].reshape(-1, self.latent_dim, self.prior_categorical_dim)
+ z = F.gumbel_softmax(
+ logits=logits,
+ tau=self._gumbel_temperature,
+ hard=self.prior_categorical_gumbel_softmax_hard,
+ dim=-1,
+ )
+ # reshape to [B, D * C], since downstream networks expect flat latents
+ return TensorUtils.flatten(z)
+
+ return TorchUtils.reparameterize(
+ mu=posterior_params["mean"],
+ logvar=posterior_params["logvar"],
+ )
+
+ def decode(self, conditions=None, goals=None, z=None, n=None):
+ """
+ Pass latents through decoder. Latents should be passed in to
+ this function at train-time for backpropagation, but they
+ can be left out at test-time. In this case, latents will
+ be sampled using the VAE prior.
+
+ Args:
+ conditions (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to the modalities used for conditioning
+ in either the decoder or the prior (or both). Only for cVAEs.
+
+ goals (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to goal modalities. Only for cVAEs.
+
+ z (torch.Tensor): if provided, these latents are used to generate
+ reconstructions from the VAE, and the prior is not sampled.
+
+ n (int): this argument is used to specify the number of samples to
+                generate from the prior. Only required if @z is None, i.e. when
+                sampling from the prior takes place.
+
+ Returns:
+ recons (dict): dictionary of reconstructed inputs
+ """
+
+ if z is None:
+ # sample latents from prior distribution
+ assert n is not None
+ z = self.sample_prior(n=n, conditions=conditions, goals=goals)
+
+ # decoder takes latents as input, and maybe condition variables
+ # and goal variables
+ inputs = dict(
+ input=dict(latent=z),
+ condition=conditions,
+ goal=goals,
+ )
+
+ # pass through decoder to reconstruct variables in @self.output_shapes
+ recons = self.nets["decoder"](**inputs)
+
+ # apply tanh squashing to output modalities
+ for k in self.output_squash:
+ recons[k] = self.output_scales[k] * torch.tanh(recons[k])
+
+ for k, v_range in self.output_ranges.items():
+ assert v_range[1] > v_range[0]
+ recons[k] = torch.sigmoid(recons[k]) * (v_range[1] - v_range[0]) + v_range[0]
+ return recons
+
+ def sample_prior(self, n, conditions=None, goals=None):
+ """
+ Samples from the prior using the prior parameters.
+
+ Args:
+ n (int): this argument is used to specify the number
+ of samples to generate from the prior.
+
+ conditions (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to the modalities used for conditioning
+ in either the decoder or the prior (or both). Only for cVAEs.
+
+ goals (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to goal modalities. Only for cVAEs.
+
+ Returns:
+ z (torch.Tensor): sampled latents from the prior
+ """
+ return self.nets["prior"].sample(n=n, obs_dict=conditions, goal_dict=goals)
+
+ def kl_loss(self, posterior_params, encoder_z=None, conditions=None, goals=None):
+ """
+ Computes KL divergence loss given the results of the VAE encoder forward
+ pass and the conditioning and goal modalities (if the prior is input-dependent).
+
+ Args:
+            posterior_params (dict): dictionary with keys "mean" and "logvar" corresponding
+ to torch.Tensor batch of means and log-variances of posterior Gaussian
+ distribution. This is the output of @self.encode.
+
+ encoder_z (torch.Tensor): samples from the Gaussian distribution parametrized by
+                the posterior mean and logvar. Only required if using a GMM prior.
+
+ conditions (dict): inputs according to @self.condition_shapes. Only needs to be provided
+ if any prior parameters are input-dependent.
+
+            goals (dict): inputs according to @self.goal_shapes (only if using goal observations)
+
+ Returns:
+ kl_loss (torch.Tensor): VAE KL divergence loss
+ """
+ return self.nets["prior"].kl_loss(
+ posterior_params=posterior_params,
+ z=encoder_z,
+ obs_dict=conditions,
+ goal_dict=goals,
+ )
+
+ def reconstruction_loss(self, reconstructions, targets):
+ """
+ Reconstruction loss. Note that we compute the average per-dimension error
+ in each modality and then average across all the modalities.
+
+ The beta term for weighting between reconstruction and kl losses will
+ need to be tuned in practice for each situation (see
+ https://twitter.com/memotv/status/973323454350090240 for more
+ discussion).
+
+ Args:
+ reconstructions (dict): reconstructed inputs, consistent with
+ @self.output_shapes
+ targets (dict): reconstruction targets, consistent with
+ @self.output_shapes
+
+ Returns:
+ reconstruction_loss (torch.Tensor): VAE reconstruction loss
+ """
+ random_key = list(reconstructions.keys())[0]
+ batch_size = reconstructions[random_key].shape[0]
+ num_mods = len(reconstructions.keys())
+
+ # collect errors per modality, while preserving shapes in @reconstructions
+ recons_errors = []
+ for k in reconstructions:
+ L2_loss = (reconstructions[k] - targets[k]).pow(2)
+ recons_errors.append(L2_loss)
+
+ # reduce errors across modalities and dimensions
+ if self.decoder_reconstruction_sum_across_elements:
+ # average across batch but sum across modalities and dimensions
+ loss = sum([x.sum() for x in recons_errors])
+ loss /= batch_size
+ else:
+ # compute mse loss in each modality and average across modalities
+ loss = sum([x.mean() for x in recons_errors])
+ loss /= num_mods
+ return loss
+
+ def forward(self, inputs, outputs, conditions=None, goals=None, freeze_encoder=False):
+ """
+ A full pass through the VAE network to construct KL and reconstruction
+ losses.
+
+ Args:
+ inputs (dict): a dictionary that maps input modalities to torch.Tensor
+ batches. These should correspond to the encoder-only modalities
+                (i.e. @self.input_shapes).
+
+ outputs (dict): a dictionary that maps output modalities to torch.Tensor
+ batches. These should correspond to the modalities used for
+ reconstruction (i.e. @self.output_shapes).
+
+ conditions (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to the modalities used for conditioning
+ in either the decoder or the prior (or both). Only for cVAEs.
+
+ goals (dict): a dictionary that maps modalities to torch.Tensor
+ batches. These should correspond to goal modalities. Only for cVAEs.
+
+ freeze_encoder (bool): if True, don't backprop into encoder by detaching
+ encoder outputs. Useful for doing staged VAE training.
+
+ Returns:
+ vae_outputs (dict): a dictionary that contains the following outputs.
+
+ encoder_params (dict): parameters for the posterior distribution
+ from the encoder forward pass
+
+ encoder_z (torch.Tensor): latents sampled from the encoder posterior
+
+ decoder_outputs (dict): reconstructions from the decoder
+
+ kl_loss (torch.Tensor): KL loss over the batch of data
+
+ reconstruction_loss (torch.Tensor): reconstruction loss over the batch of data
+ """
+
+ # In the comments below, X = inputs, Y = conditions, and we seek to learn P(X | Y).
+ # The decoder and prior only have knowledge about Y and try to reconstruct X.
+ # Notice that when Y is the empty set, this reduces to a normal VAE.
+
+ # mu, logvar <- Enc(X, Y)
+ posterior_params = self.encode(
+ inputs=inputs,
+ conditions=conditions,
+ goals=goals,
+ )
+
+ if freeze_encoder:
+ posterior_params = TensorUtils.detach(posterior_params)
+
+ # z ~ Enc(z | X, Y)
+ encoder_z = self.reparameterize(posterior_params)
+
+ # hat(X) = Dec(z, Y)
+ reconstructions = self.decode(
+ conditions=conditions,
+ goals=goals,
+ z=encoder_z,
+ )
+
+ # this will also train prior network z ~ Prior(z | Y)
+ kl_loss = self.kl_loss(
+ posterior_params=posterior_params,
+ encoder_z=encoder_z,
+ conditions=conditions,
+ goals=goals,
+ )
+
+ reconstruction_loss = self.reconstruction_loss(
+ reconstructions=reconstructions,
+ targets=outputs,
+ )
+
+ return {
+ "encoder_params" : posterior_params,
+ "encoder_z" : encoder_z,
+ "decoder_outputs" : reconstructions,
+ "kl_loss" : kl_loss,
+ "reconstruction_loss" : reconstruction_loss,
+ }
+
+ def set_gumbel_temperature(self, temperature):
+ """
+ Used by external algorithms to schedule Gumbel-Softmax temperature,
+ which is used during reparametrization at train-time. Should only
+ be used if @self.prior_use_categorical is True.
+ """
+ assert self.prior_use_categorical
+ self._gumbel_temperature = temperature
+
+ def get_gumbel_temperature(self):
+ """
+ Return current Gumbel-Softmax temperature. Should only be used if
+ @self.prior_use_categorical is True.
+ """
+ assert self.prior_use_categorical
+ return self._gumbel_temperature
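+
+
+# Illustrative sketch (not part of robomimic): a typical beta-weighted training
+# objective assembled from the dictionary returned by VAE.forward. The `vae`
+# module, the `batch` layout, and the beta value are all hypothetical - beta
+# must be tuned per task, as noted in VAE.reconstruction_loss.
+def _example_vae_training_loss(vae, batch, beta=1e-4):
+    out = vae(
+        inputs=batch["obs"],
+        outputs=batch["obs"],          # symmetric reconstruction in this sketch
+        conditions=batch.get("cond"),  # None for a plain (non-conditional) VAE
+    )
+    return out["reconstruction_loss"] + beta * out["kl_loss"]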
diff --git a/phantom/submodules/phantom-robomimic/robomimic/models/value_nets.py b/phantom/submodules/phantom-robomimic/robomimic/models/value_nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..c98fa7e4e0f4185b2a11e5581158268aa9cda2cf
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/models/value_nets.py
@@ -0,0 +1,318 @@
+"""
+Contains torch Modules for value networks. These networks take an
+observation dictionary as input (and possibly additional conditioning,
+such as subgoal or goal dictionaries) and produce value or
+action-value estimates or distributions.
+"""
+import numpy as np
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributions as D
+
+import robomimic.utils.tensor_utils as TensorUtils
+from robomimic.models.obs_nets import MIMO_MLP
+from robomimic.models.distributions import DiscreteValueDistribution
+
+
+class ValueNetwork(MIMO_MLP):
+ """
+ A basic value network that predicts values from observations.
+ Can optionally be goal conditioned on future observations.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ mlp_layer_dims,
+ value_bounds=None,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps observation keys to
+ expected shapes for observations.
+
+            mlp_layer_dims ([int]): sequence of integers for the MLP hidden layer sizes.
+
+ value_bounds (tuple): a 2-tuple corresponding to the lowest and highest possible return
+                that the network should be capable of generating. The network will rescale outputs
+ using a tanh layer to lie within these bounds. If None, no tanh re-scaling is done.
+
+ goal_shapes (OrderedDict): a dictionary that maps observation keys to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-observation key information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+ self.value_bounds = value_bounds
+ if self.value_bounds is not None:
+ # convert [lb, ub] to a scale and offset for the tanh output, which is in [-1, 1]
+ self._value_scale = (float(self.value_bounds[1]) - float(self.value_bounds[0])) / 2.
+ self._value_offset = (float(self.value_bounds[1]) + float(self.value_bounds[0])) / 2.
+
+ assert isinstance(obs_shapes, OrderedDict)
+ self.obs_shapes = obs_shapes
+
+ # set up different observation groups for @MIMO_MLP
+ observation_group_shapes = OrderedDict()
+ observation_group_shapes["obs"] = OrderedDict(self.obs_shapes)
+
+ self._is_goal_conditioned = False
+ if goal_shapes is not None and len(goal_shapes) > 0:
+ assert isinstance(goal_shapes, OrderedDict)
+ self._is_goal_conditioned = True
+ self.goal_shapes = OrderedDict(goal_shapes)
+ observation_group_shapes["goal"] = OrderedDict(self.goal_shapes)
+ else:
+ self.goal_shapes = OrderedDict()
+
+ output_shapes = self._get_output_shapes()
+ super(ValueNetwork, self).__init__(
+ input_obs_group_shapes=observation_group_shapes,
+ output_shapes=output_shapes,
+ layer_dims=mlp_layer_dims,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _get_output_shapes(self):
+ """
+ Allow subclasses to re-define outputs from @MIMO_MLP, since we won't
+ always directly predict values, but may instead predict the parameters
+ of a value distribution.
+ """
+ return OrderedDict(value=(1,))
+
+ def output_shape(self, input_shape=None):
+ """
+ Function to compute output shape from inputs to this module.
+
+ Args:
+ input_shape (iterable of int): shape of input. Does not include batch dimension.
+ Some modules may not need this argument, if their output does not depend
+ on the size of the input, or if they assume fixed size input.
+
+ Returns:
+ out_shape ([int]): list of integers corresponding to output shape
+ """
+ return [1]
+
+ def forward(self, obs_dict, goal_dict=None):
+ """
+ Forward through value network, and then optionally use tanh scaling.
+ """
+ values = super(ValueNetwork, self).forward(obs=obs_dict, goal=goal_dict)["value"]
+ if self.value_bounds is not None:
+ values = self._value_offset + self._value_scale * torch.tanh(values)
+ return values
+
+ def _to_string(self):
+ return "value_bounds={}".format(self.value_bounds)
+
+
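+# Illustrative sketch (not part of robomimic): the tanh rescaling that
+# ValueNetwork.forward applies when value_bounds=(lb, ub) is provided. Raw,
+# unbounded network outputs are squashed into (lb, ub) with a scale and offset
+# around tanh; the bounds below are made-up.
+def _example_rescale_values(raw_values, value_bounds=(-100.0, 0.0)):
+    import torch
+    lb, ub = float(value_bounds[0]), float(value_bounds[1])
+    scale = (ub - lb) / 2.
+    offset = (ub + lb) / 2.
+    return offset + scale * torch.tanh(raw_values)
+
+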
+class ActionValueNetwork(ValueNetwork):
+ """
+ A basic Q (action-value) network that predicts values from observations
+ and actions. Can optionally be goal conditioned on future observations.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ mlp_layer_dims,
+ value_bounds=None,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps observation keys to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+            mlp_layer_dims ([int]): sequence of integers for the MLP hidden layer sizes.
+
+ value_bounds (tuple): a 2-tuple corresponding to the lowest and highest possible return
+                that the network should be capable of generating. The network will rescale outputs
+ using a tanh layer to lie within these bounds. If None, no tanh re-scaling is done.
+
+ goal_shapes (OrderedDict): a dictionary that maps observation keys to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-observation key information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+
+ # add in action as a modality
+ new_obs_shapes = OrderedDict(obs_shapes)
+ new_obs_shapes["action"] = (ac_dim,)
+ self.ac_dim = ac_dim
+
+ # pass to super class to instantiate network
+ super(ActionValueNetwork, self).__init__(
+ obs_shapes=new_obs_shapes,
+ mlp_layer_dims=mlp_layer_dims,
+ value_bounds=value_bounds,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def forward(self, obs_dict, acts, goal_dict=None):
+ """
+ Modify forward from super class to include actions in inputs.
+ """
+ inputs = dict(obs_dict)
+ inputs["action"] = acts
+ return super(ActionValueNetwork, self).forward(inputs, goal_dict)
+
+ def _to_string(self):
+ return "action_dim={}\nvalue_bounds={}".format(self.ac_dim, self.value_bounds)
+
+
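+# Illustrative sketch (not part of robomimic): ActionValueNetwork reuses the
+# ValueNetwork machinery by treating the action as one extra flat observation
+# key, exactly as done in ActionValueNetwork.__init__ and forward above. The
+# tensors here are hypothetical.
+def _example_q_inputs(obs_dict, acts):
+    inputs = dict(obs_dict)  # shallow copy so the caller's dict is untouched
+    inputs["action"] = acts  # shape [B, ac_dim]
+    return inputs
+
+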
+class DistributionalActionValueNetwork(ActionValueNetwork):
+ """
+ Distributional Q (action-value) network that outputs a categorical distribution over
+ a discrete grid of value atoms. See https://arxiv.org/pdf/1707.06887.pdf for
+ more details.
+ """
+ def __init__(
+ self,
+ obs_shapes,
+ ac_dim,
+ mlp_layer_dims,
+ value_bounds,
+ num_atoms,
+ goal_shapes=None,
+ encoder_kwargs=None,
+ ):
+ """
+ Args:
+ obs_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for observations.
+
+ ac_dim (int): dimension of action space.
+
+            mlp_layer_dims ([int]): sequence of integers for the MLP hidden layer sizes.
+
+ value_bounds (tuple): a 2-tuple corresponding to the lowest and highest possible return
+                that the network should be capable of generating. This defines the support
+ of the value distribution.
+
+            num_atoms (int): number of value atoms in the categorical distribution
+                that represents the value distribution.
+
+ goal_shapes (OrderedDict): a dictionary that maps modality to
+ expected shapes for goal observations.
+
+ encoder_kwargs (dict or None): If None, results in default encoder_kwargs being applied. Otherwise, should
+ be nested dictionary containing relevant per-modality information for encoder networks.
+ Should be of form:
+
+ obs_modality1: dict
+ feature_dimension: int
+ core_class: str
+ core_kwargs: dict
+ ...
+ ...
+ obs_randomizer_class: str
+ obs_randomizer_kwargs: dict
+ ...
+ ...
+ obs_modality2: dict
+ ...
+ """
+
+ # parameters specific to DistributionalActionValueNetwork
+ self.num_atoms = num_atoms
+ self._atoms = np.linspace(value_bounds[0], value_bounds[1], num_atoms)
+
+ # pass to super class to instantiate network
+ super(DistributionalActionValueNetwork, self).__init__(
+ obs_shapes=obs_shapes,
+ ac_dim=ac_dim,
+ mlp_layer_dims=mlp_layer_dims,
+ value_bounds=value_bounds,
+ goal_shapes=goal_shapes,
+ encoder_kwargs=encoder_kwargs,
+ )
+
+ def _get_output_shapes(self):
+ """
+ Network outputs log probabilities for categorical distribution over discrete value grid.
+ """
+ return OrderedDict(log_probs=(self.num_atoms,))
+
+ def forward_train(self, obs_dict, acts, goal_dict=None):
+ """
+ Return full critic categorical distribution.
+
+ Args:
+ obs_dict (dict): batch of observations
+ acts (torch.Tensor): batch of actions
+ goal_dict (dict): if not None, batch of goal observations
+
+ Returns:
+ value_distribution (DiscreteValueDistribution instance)
+ """
+
+ # add in actions
+ inputs = dict(obs_dict)
+ inputs["action"] = acts
+
+ # network returns unnormalized log probabilities (logits) for each of the value atoms
+ logits = MIMO_MLP.forward(self, obs=inputs, goal=goal_dict)["log_probs"]
+
+ # turn these logits into a categorical distribution over the value atoms.
+ # (unsqueeze to make sure atoms are compatible with batch operations)
+ value_atoms = torch.Tensor(self._atoms).unsqueeze(0).to(logits.device)
+ return DiscreteValueDistribution(values=value_atoms, logits=logits)
+
+ def forward(self, obs_dict, acts, goal_dict=None):
+ """
+ Return mean of critic categorical distribution. Useful for obtaining
+ point estimates of critic values.
+
+ Args:
+ obs_dict (dict): batch of observations
+ acts (torch.Tensor): batch of actions
+ goal_dict (dict): if not None, batch of goal observations
+
+ Returns:
+ mean_value (torch.Tensor): expectation of value distribution
+ """
+ vd = self.forward_train(obs_dict=obs_dict, acts=acts, goal_dict=goal_dict)
+ return vd.mean()
+
+ def _to_string(self):
+ return "action_dim={}\nvalue_bounds={}\nnum_atoms={}".format(self.ac_dim, self.value_bounds, self.num_atoms)
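+
+
+# Illustrative sketch (not part of robomimic): how a categorical distribution
+# over a fixed grid of value atoms collapses to a scalar estimate, mirroring
+# what DistributionalActionValueNetwork.forward does via DiscreteValueDistribution.
+# The bounds and atom count are made-up.
+def _example_distributional_value_mean(logits, value_bounds=(-100.0, 0.0), num_atoms=51):
+    import numpy as np
+    import torch
+    atoms = torch.tensor(
+        np.linspace(value_bounds[0], value_bounds[1], num_atoms), dtype=torch.float32
+    )                                            # [num_atoms]
+    probs = torch.softmax(logits, dim=-1)        # [B, num_atoms]
+    return (probs * atoms.unsqueeze(0)).sum(-1)  # [B]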
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/bc_xfmr_gen.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/bc_xfmr_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6fe0db1e630658e6fb4a152dbb4a8035efa8a8f
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/bc_xfmr_gen.py
@@ -0,0 +1,156 @@
+from robomimic.scripts.config_gen.helper import *
+
+def make_generator_helper(args):
+ algo_name_short = "bc_xfmr"
+
+ generator = get_generator(
+ algo_name="diffusion_policy",
+ config_file=os.path.join(base_path, 'robomimic/exps/templates/diffusion_policy.json'),
+ args=args,
+ algo_name_short=algo_name_short,
+ pt=True,
+ )
+ if args.ckpt_mode is None:
+ args.ckpt_mode = "off"
+
+ generator.add_param(
+ key="train.num_data_workers",
+ name="",
+ group=-1,
+ values=[4],
+ )
+ generator.add_param(
+ key="experiment.save.every_n_epochs",
+ name="",
+ group=-1,
+ values=[
+ 100
+ ],
+ )
+
+ # run rollouts at epoch 0 only
+ generator.add_param(
+ key="experiment.rollout.warmstart",
+ name="",
+ group=-1,
+ values=[
+ -1,
+ ],
+ )
+ generator.add_param(
+ key="train.num_epochs",
+ name="",
+ group=-1,
+ values=[40],
+ )
+ generator.add_param(
+ key="experiment.rollout.rate",
+ name="",
+ group=-1,
+ values=[10],
+ )
+
+ if args.env == "r2d2":
+ generator.add_param(
+ key="train.data",
+ name="ds",
+ group=2,
+ values=[
+ # [{"path": p} for p in scan_datasets("~/code/r2d2/data/success/2023-05-23_t2c-cans", postfix="trajectory_im84.h5")],
+ [{"path": p} for p in scan_datasets("/home/cchi/local/data/r2d2/pen/success/2023-02-28", postfix="trajectory_im128.h5")],
+ ],
+ value_names=[
+ "pnp-t2c-cans-84",
+ # "pnp-t2c-cans-128",
+ ],
+ )
+ generator.add_param(
+ key="observation.encoder.rgb.obs_randomizer_kwargs.crop_height",
+ name="",
+ group=2,
+ values=[
+ 76,
+ # 116
+ ],
+ )
+ generator.add_param(
+ key="observation.encoder.rgb.obs_randomizer_kwargs.crop_width",
+ name="",
+ group=2,
+ values=[
+ 76,
+ # 116
+ ],
+ )
+ elif args.env == "square":
+ generator.add_param(
+ key="train.data",
+ name="ds",
+ group=2,
+ values=[
+ [
+ {"path": "~/datasets/square/ph/image_v141.hdf5"},
+ {"path": "~/datasets/square/ph/image_v141.hdf5"},
+ ],
+ ],
+ value_names=[
+ "square",
+ ],
+ )
+ else:
+        raise ValueError("unsupported env: {}".format(args.env))
+
+ if "experiment.ckpt_path" in generator.parameters:
+ generator.add_param(
+ key="algo.optim_params.policy.learning_rate.initial",
+ name="lrinit",
+ group=110,
+ values=[
+ 1e-5,
+ ],
+ hidename=True,
+ )
+ generator.add_param(
+ key="algo.optim_params.policy.learning_rate.lr_scheduler_type",
+ name="lrsch",
+ group=111,
+ values=[
+ # "linear",
+ None,
+ ],
+ value_names=[
+ "none"
+ ],
+ hidename=True,
+ )
+
+ generator.add_param(
+ key="train.output_dir",
+ name="",
+ group=-1,
+ values=[
+ "/home/cchi/dev/robomimic_r2d2/datasets/experiment_results/{env}/{mod}/{algo_name_short}".format(
+ env=args.env,
+ mod=args.mod,
+ algo_name_short=algo_name_short,
+ )
+ ],
+ )
+
+ generator.add_param(
+ key="experiment.rollout.enabled",
+ name="",
+ group=-1,
+ values=[
+ True
+ ],
+ hidename=False,
+ )
+
+ return generator
+
+if __name__ == "__main__":
+ parser = get_argparser()
+
+ args = parser.parse_args()
+ make_generator(args, make_generator_helper)
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/diffusion_gen.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/diffusion_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..990a163ae72aa913fafcb87e2372ae7ca5d1fda4
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/diffusion_gen.py
@@ -0,0 +1,147 @@
+from robomimic.scripts.config_gen.helper import *
+
+def make_generator_helper(args):
+ algo_name_short = "diffusion_policy"
+
+ generator = get_generator(
+ algo_name="diffusion_policy",
+ config_file=os.path.join(base_path, 'robomimic/exps/templates/diffusion_policy.json'),
+ args=args,
+ algo_name_short=algo_name_short,
+ pt=True,
+ )
+ if args.ckpt_mode is None:
+ args.ckpt_mode = "off"
+
+ generator.add_param(
+ key="train.num_data_workers",
+ name="",
+ group=-1,
+ values=[8],
+ )
+
+ generator.add_param(
+ key="train.num_epochs",
+ name="",
+ group=-1,
+ values=[1000],
+ )
+
+ # use ddim by default
+ generator.add_param(
+ key="algo.ddim.enabled",
+ name="ddim",
+ group=1001,
+ values=[
+ True,
+ # False,
+ ],
+ )
+ generator.add_param(
+ key="algo.ddpm.enabled",
+ name="ddpm",
+ group=1001,
+ values=[
+ False,
+ # True,
+ ],
+ hidename=True,
+ )
+
+ if args.env == "r2d2":
+ generator.add_param(
+ key="train.data",
+ name="ds",
+ group=2,
+ values=[
+ [{"path": p} for p in scan_datasets("~/Downloads/example_pen_in_cup", postfix="trajectory_im128.h5")],
+ ],
+ value_names=[
+ "pen-in-cup",
+ ],
+ )
+ generator.add_param(
+ key="train.action_keys",
+ name="ac_keys",
+ group=-1,
+ values=[
+ [
+ "action/abs_pos",
+ "action/abs_rot_6d",
+ "action/gripper_velocity",
+ ],
+ ],
+ value_names=[
+ "abs",
+ ],
+ )
+ elif args.env == "square":
+ generator.add_param(
+ key="train.data",
+ name="ds",
+ group=2,
+ values=[
+ [
+ # TODO: point to the hdf5 file
+ # {"path": "/home/cchi/dev/robomimic_r2d2/datasets/square/ph/image_abs.hdf5"},
+ # {"path": "~/datasets/square/ph/image_v141.hdf5"},
+ # {"path": "~/datasets/square/ph/image.hdf5"},
+ {"path": "~/datasets/square/ph/square_ph_abs_tmp.hdf5"}, # replace with your own path
+ ],
+ ],
+ value_names=[
+ "square",
+ ],
+ )
+
+ # update env config to use absolute action control
+ generator.add_param(
+ key="experiment.env_meta_update_dict",
+ name="",
+ group=-1,
+ values=[
+ {"env_kwargs": {"controller_configs": {"control_delta": False}}}
+ ],
+ )
+
+ generator.add_param(
+ key="train.action_keys",
+ name="ac_keys",
+ group=-1,
+ values=[
+ [
+ "action_dict/abs_pos",
+ "action_dict/abs_rot_6d",
+ "action_dict/gripper",
+ # "actions",
+ ],
+ ],
+ value_names=[
+ "abs",
+ ],
+ )
+
+
+ else:
+        raise ValueError("unsupported env: {}".format(args.env))
+
+ generator.add_param(
+ key="train.output_dir",
+ name="",
+ group=-1,
+ values=[
+ "~/expdata/{env}/{mod}/{algo_name_short}".format(
+ env=args.env,
+ mod=args.mod,
+ algo_name_short=algo_name_short,
+ )
+ ],
+ )
+
+ return generator
+
+if __name__ == "__main__":
+ parser = get_argparser()
+
+ args = parser.parse_args()
+ make_generator(args, make_generator_helper)
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/helper.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca29a37843c2b208f5ae7a3282bc9f41f426faa4
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/config_gen/helper.py
@@ -0,0 +1,709 @@
+import argparse
+import os
+import time
+import datetime
+
+import robomimic
+import robomimic.utils.hyperparam_utils as HyperparamUtils
+
+base_path = os.path.abspath(os.path.join(os.path.dirname(robomimic.__file__), os.pardir))
+
+def scan_datasets(folder, postfix=".h5"):
+ dataset_paths = []
+ for root, dirs, files in os.walk(os.path.expanduser(folder)):
+ for f in files:
+ if f.endswith(postfix):
+ dataset_paths.append(os.path.join(root, f))
+ return dataset_paths
+
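+
+# Illustrative sketch (not part of robomimic): the way the r2d2 branches of the
+# *_gen.py scripts feed scan_datasets results into the "train.data" parameter.
+# The folder and postfix here are hypothetical.
+def _example_dataset_param_values():
+    paths = scan_datasets("~/datasets/my_task", postfix="trajectory_im128.h5")
+    return [{"path": p} for p in paths]
+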
+
+def get_generator(algo_name, config_file, args, algo_name_short=None, pt=False):
+ if args.wandb_proj_name is None:
+ strings = [
+ algo_name_short if (algo_name_short is not None) else algo_name,
+ args.name,
+ args.env,
+ args.mod,
+ ]
+ args.wandb_proj_name = '_'.join([str(s) for s in strings if s is not None])
+
+ if args.script is not None:
+ generated_config_dir = os.path.join(os.path.dirname(args.script), "json")
+ else:
+ curr_time = datetime.datetime.fromtimestamp(time.time()).strftime('%m-%d-%y-%H-%M-%S')
+ generated_config_dir=os.path.join(
+ '~/', 'tmp/autogen_configs/ril', algo_name, args.env, args.mod, args.name, curr_time, "json",
+ )
+
+ generator = HyperparamUtils.ConfigGenerator(
+ base_config_file=config_file,
+ generated_config_dir=generated_config_dir,
+ wandb_proj_name=args.wandb_proj_name,
+ script_file=args.script,
+ )
+
+ args.algo_name = algo_name
+ args.pt = pt
+
+ return generator
+
+
+def set_env_settings(generator, args):
+ if args.env in ["r2d2"]:
+ assert args.mod == "im"
+ generator.add_param(
+ key="experiment.rollout.enabled",
+ name="",
+ group=-1,
+ values=[
+ False
+ ],
+ )
+ generator.add_param(
+ key="experiment.save.every_n_epochs",
+ name="",
+ group=-1,
+ values=[50],
+ )
+ if "observation.modalities.obs.low_dim" not in generator.parameters:
+ generator.add_param(
+ key="observation.modalities.obs.low_dim",
+ name="",
+ group=-1,
+ values=[
+ ["robot_state/cartesian_position", "robot_state/gripper_position"]
+ ],
+ )
+ if "observation.modalities.obs.rgb" not in generator.parameters:
+ generator.add_param(
+ key="observation.modalities.obs.rgb",
+ name="",
+ group=-1,
+ values=[
+ [
+ "camera/image/hand_camera_image",
+ # "camera/image/varied_camera_1_image", "camera/image/varied_camera_2_image" # uncomment to use all 3 cameras
+ ]
+ ],
+ )
+ if "observation.encoder.rgb.obs_randomizer_kwargs.crop_height" not in generator.parameters:
+ generator.add_param(
+ key="observation.encoder.rgb.obs_randomizer_kwargs.crop_height",
+ name="",
+ group=-1,
+ values=[
+ 116
+ ],
+ )
+ generator.add_param(
+ key="observation.encoder.rgb.obs_randomizer_kwargs.crop_width",
+ name="",
+ group=-1,
+ values=[
+ 116
+ ],
+ )
+ generator.add_param(
+ key="train.data_format",
+ name="",
+ group=-1,
+ values=[
+ "r2d2"
+ ],
+ )
+ # specify action keys in your _gen.py
+ # here, we list how each action key should be treated (normalized etc)
+ generator.add_param(
+ key="train.action_config",
+ name="",
+ group=-1,
+ values=[
+ {
+ "action/cartesian_position":{
+ "normalization": "min_max",
+ },
+ "action/abs_pos":{
+ "normalization": "min_max",
+ },
+ "action/abs_rot_6d":{
+ "normalization": "min_max",
+ "format": "rot_6d",
+ },
+ "action/abs_rot_axis_angle":{
+ "normalization": "min_max",
+ "format": "rot_axis_angle",
+ },
+ "action/gripper_position":{
+ "normalization": "min_max",
+ },
+ "action/cartesian_velocity":{
+ "normalization": None,
+ },
+ "action/gripper_velocity":{
+ "normalization": None,
+ },
+ }
+ ],
+ )
+ generator.add_param(
+ key="train.dataset_keys",
+ name="",
+ group=-1,
+ values=[[]],
+ )
+ elif args.env in ['square', 'lift', 'place_close']:
+ # # set videos off
+ # args.no_video = True
+
+ generator.add_param(
+ key="train.action_config",
+ name="",
+ group=-1,
+ values=[
+ {
+ "actions":{
+ "normalization": None,
+ },
+ "action_dict/abs_pos": {
+ "normalization": "min_max"
+ },
+ "action_dict/abs_rot_axis_angle": {
+ "normalization": "min_max",
+ "format": "rot_axis_angle"
+ },
+ "action_dict/abs_rot_6d": {
+ "normalization": None,
+ "format": "rot_6d"
+ },
+ "action_dict/rel_pos": {
+ "normalization": None,
+ },
+ "action_dict/rel_rot_axis_angle": {
+ "normalization": None,
+ "format": "rot_axis_angle"
+ },
+ "action_dict/rel_rot_6d": {
+ "normalization": None,
+ "format": "rot_6d"
+ },
+ "action_dict/gripper": {
+ "normalization": None,
+ }
+ }
+ ],
+ )
+
+ if args.mod == 'im':
+ generator.add_param(
+ key="observation.modalities.obs.low_dim",
+ name="",
+ group=-1,
+ values=[
+ ["robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos"]
+ ],
+ )
+ generator.add_param(
+ key="observation.modalities.obs.rgb",
+ name="",
+ group=-1,
+ values=[
+ ["agentview_image",
+ "robot0_eye_in_hand_image"]
+ ],
+ )
+ else:
+ generator.add_param(
+ key="observation.modalities.obs.low_dim",
+ name="",
+ group=-1,
+ values=[
+ ["robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"]
+ ],
+ )
+ elif args.env == 'transport':
+ # set videos off
+ args.no_video = True
+
+ # TODO: fix 2 robot case
+ generator.add_param(
+ key="train.action_config",
+ name="",
+ group=-1,
+ values=[
+ {
+ "actions":{
+ "normalization": None,
+ },
+ "action_dict/abs_pos": {
+ "normalization": "min_max"
+ },
+ "action_dict/abs_rot_axis_angle": {
+ "normalization": "min_max",
+ "format": "rot_axis_angle"
+ },
+ "action_dict/abs_rot_6d": {
+ "normalization": None,
+ "format": "rot_6d"
+ },
+ "action_dict/rel_pos": {
+ "normalization": None,
+ },
+ "action_dict/rel_rot_axis_angle": {
+ "normalization": None,
+ "format": "rot_axis_angle"
+ },
+ "action_dict/rel_rot_6d": {
+ "normalization": None,
+ "format": "rot_6d"
+ },
+ "action_dict/gripper": {
+ "normalization": None,
+ }
+ }
+ ],
+ )
+
+ if args.mod == 'im':
+ generator.add_param(
+ key="observation.modalities.obs.low_dim",
+ name="",
+ group=-1,
+ values=[
+ ["robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "robot1_eef_pos",
+ "robot1_eef_quat",
+ "robot1_gripper_qpos"]
+ ],
+ )
+ generator.add_param(
+ key="observation.modalities.obs.rgb",
+ name="",
+ group=-1,
+ values=[
+ ["shouldercamera0_image",
+ "robot0_eye_in_hand_image",
+ "shouldercamera1_image",
+ "robot1_eye_in_hand_image"]
+ ],
+ )
+ else:
+ generator.add_param(
+ key="observation.modalities.obs.low_dim",
+ name="",
+ group=-1,
+ values=[
+ ["robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "robot1_eef_pos",
+ "robot1_eef_quat",
+ "robot1_gripper_qpos",
+ "object"]
+ ],
+ )
+
+ generator.add_param(
+ key="experiment.rollout.horizon",
+ name="",
+ group=-1,
+ values=[700],
+ )
+ elif args.env == 'tool_hang':
+ # set videos off
+ args.no_video = True
+
+ generator.add_param(
+ key="train.action_config",
+ name="",
+ group=-1,
+ values=[
+ {
+ "actions":{
+ "normalization": None,
+ },
+ "action_dict/abs_pos": {
+ "normalization": "min_max"
+ },
+ "action_dict/abs_rot_axis_angle": {
+ "normalization": "min_max",
+ "format": "rot_axis_angle"
+ },
+ "action_dict/abs_rot_6d": {
+ "normalization": None,
+ "format": "rot_6d"
+ },
+ "action_dict/rel_pos": {
+ "normalization": None,
+ },
+ "action_dict/rel_rot_axis_angle": {
+ "normalization": None,
+ "format": "rot_axis_angle"
+ },
+ "action_dict/rel_rot_6d": {
+ "normalization": None,
+ "format": "rot_6d"
+ },
+ "action_dict/gripper": {
+ "normalization": None,
+ }
+ }
+ ],
+ )
+
+ if args.mod == 'im':
+ generator.add_param(
+ key="observation.modalities.obs.low_dim",
+ name="",
+ group=-1,
+ values=[
+ ["robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos"]
+ ],
+ )
+ generator.add_param(
+ key="observation.modalities.obs.rgb",
+ name="",
+ group=-1,
+ values=[
+ ["sideview_image",
+ "robot0_eye_in_hand_image"]
+ ],
+ )
+ generator.add_param(
+ key="observation.encoder.rgb.obs_randomizer_kwargs.crop_height",
+ name="",
+ group=-1,
+ values=[
+ 216
+ ],
+ )
+ generator.add_param(
+ key="observation.encoder.rgb.obs_randomizer_kwargs.crop_width",
+ name="",
+ group=-1,
+ values=[
+ 216
+ ],
+ )
+ generator.add_param(
+ key="observation.encoder.rgb2.obs_randomizer_kwargs.crop_height",
+ name="",
+ group=-1,
+ values=[
+ 216
+ ],
+ )
+ generator.add_param(
+ key="observation.encoder.rgb2.obs_randomizer_kwargs.crop_width",
+ name="",
+ group=-1,
+ values=[
+ 216
+ ],
+ )
+ else:
+ generator.add_param(
+ key="observation.modalities.obs.low_dim",
+ name="",
+ group=-1,
+ values=[
+ ["robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object"]
+ ],
+ )
+
+ generator.add_param(
+ key="experiment.rollout.horizon",
+ name="",
+ group=-1,
+ values=[700],
+ )
+ else:
+        raise ValueError("unsupported env: {}".format(args.env))
+
+
+def set_mod_settings(generator, args):
+ if args.mod == 'ld':
+ if "experiment.save.epochs" not in generator.parameters:
+ generator.add_param(
+ key="experiment.save.epochs",
+ name="",
+ group=-1,
+ values=[
+ [2000]
+ ],
+ )
+ elif args.mod == 'im':
+ if "experiment.save.every_n_epochs" not in generator.parameters:
+ generator.add_param(
+ key="experiment.save.every_n_epochs",
+ name="",
+ group=-1,
+ values=[20],
+ )
+
+ generator.add_param(
+ key="experiment.epoch_every_n_steps",
+ name="",
+ group=-1,
+ values=[500],
+ )
+ if "train.num_data_workers" not in generator.parameters:
+ generator.add_param(
+ key="train.num_data_workers",
+ name="",
+ group=-1,
+ values=[4],
+ )
+ generator.add_param(
+ key="train.hdf5_cache_mode",
+ name="",
+ group=-1,
+ values=["low_dim"],
+ )
+ if "train.batch_size" not in generator.parameters:
+ generator.add_param(
+ key="train.batch_size",
+ name="",
+ group=-1,
+ values=[16],
+ )
+ if "train.num_epochs" not in generator.parameters:
+ generator.add_param(
+ key="train.num_epochs",
+ name="",
+ group=-1,
+ values=[600],
+ )
+ if "experiment.rollout.rate" not in generator.parameters:
+ generator.add_param(
+ key="experiment.rollout.rate",
+ name="",
+ group=-1,
+ values=[20],
+ )
+
+
+def set_debug_mode(generator, args):
+ if not args.debug:
+ return
+
+ generator.add_param(
+ key="experiment.rollout.n",
+ name="",
+ group=-1,
+ values=[2],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="experiment.rollout.horizon",
+ name="",
+ group=-1,
+ values=[30],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="experiment.rollout.rate",
+ name="",
+ group=-1,
+ values=[2],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="experiment.epoch_every_n_steps",
+ name="",
+ group=-1,
+ values=[2],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="experiment.save.every_n_epochs",
+ name="",
+ group=-1,
+ values=[2],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="experiment.validation_epoch_every_n_steps",
+ name="",
+ group=-1,
+ values=[2],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="train.num_epochs",
+ name="",
+ group=-1,
+ values=[2],
+ value_names=[""],
+ )
+ if args.name is None:
+ generator.add_param(
+ key="experiment.name",
+ name="",
+ group=-1,
+ values=["debug"],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="experiment.save.enabled",
+ name="",
+ group=-1,
+ values=[False],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="train.hdf5_cache_mode",
+ name="",
+ group=-1,
+ values=["low_dim"],
+ value_names=[""],
+ )
+ generator.add_param(
+ key="train.num_data_workers",
+ name="",
+ group=-1,
+ values=[3],
+ )
+
+
+def get_argparser():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--name",
+ type=str,
+ )
+
+ parser.add_argument(
+ "--env",
+ type=str,
+ default='r2d2',
+ )
+
+ parser.add_argument(
+ '--mod',
+ type=str,
+ choices=['ld', 'im'],
+ default='im',
+ )
+
+ parser.add_argument(
+ "--ckpt_mode",
+ type=str,
+ choices=["off", "all", "best_only"],
+ default=None,
+ )
+
+ parser.add_argument(
+ "--script",
+ type=str,
+ default=None
+ )
+
+ parser.add_argument(
+ "--wandb_proj_name",
+ type=str,
+ default=None
+ )
+
+ parser.add_argument(
+ "--debug",
+ action="store_true",
+ )
+
+ parser.add_argument(
+ '--no_video',
+ action='store_true'
+ )
+
+ parser.add_argument(
+ "--tmplog",
+ action="store_true",
+ )
+
+ parser.add_argument(
+ "--nr",
+ type=int,
+ default=-1
+ )
+
+ parser.add_argument(
+ "--no_wandb",
+ action="store_true",
+ )
+
+ parser.add_argument(
+ "--n_seeds",
+ type=int,
+ default=None
+ )
+
+ parser.add_argument(
+ "--num_cmd_groups",
+ type=int,
+ default=None
+ )
+
+ return parser
+
+
+def make_generator(args, make_generator_helper):
+    if (args.tmplog or args.debug) and args.name is None:
+ args.name = "debug"
+ else:
+ time_str = datetime.datetime.fromtimestamp(time.time()).strftime('%m-%d-')
+ args.name = time_str + str(args.name)
+
+ if args.debug or args.tmplog:
+ args.no_wandb = True
+
+ if args.wandb_proj_name is not None:
+        # prepend date to wandb name
+ # time_str = datetime.datetime.fromtimestamp(time.time()).strftime('%m-%d-')
+ # args.wandb_proj_name = time_str + args.wandb_proj_name
+ pass
+
+ if (args.debug or args.tmplog) and (args.wandb_proj_name is None):
+ args.wandb_proj_name = 'debug'
+
+ if not args.debug:
+ assert args.name is not None
+
+ # make config generator
+ generator = make_generator_helper(args)
+
+    if args.ckpt_mode is None:
+        # some config-gen scripts add a --pt flag; fall back to False when it is absent
+        if getattr(args, "pt", False):
+            args.ckpt_mode = "all"
+        else:
+            args.ckpt_mode = "best_only"
+
+ set_env_settings(generator, args)
+ set_mod_settings(generator, args)
+
+ # set the debug settings last, to override previous setting changes
+ set_debug_mode(generator, args)
+
+ """ misc settings """
+ generator.add_param(
+ key="experiment.validate",
+ name="",
+ group=-1,
+ values=[
+ False,
+ ],
+ )
+
+ # generate jsons and script
+ generator.generate()
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_d4rl.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_d4rl.py
new file mode 100644
index 0000000000000000000000000000000000000000..99fc1d93c53709f6f149d43457c582cea77e853c
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_d4rl.py
@@ -0,0 +1,143 @@
+"""
+Helper script to convert D4RL data into an hdf5 compatible with this repository.
+Takes a folder path and a D4RL env name. This script downloads the corresponding
+raw D4RL dataset into a "d4rl" subfolder, and then makes a converted dataset
+in the "d4rl/converted" subfolder.
+
+This script has been tested on the following commits:
+
+ https://github.com/rail-berkeley/d4rl/tree/9b68f31bab6a8546edfb28ff0bd9d5916c62fd1f
+ https://github.com/rail-berkeley/d4rl/tree/26adf732efafdad864b3df2287e7b778ee4f7f63
+
+Args:
+ env (str): d4rl env name, which specifies the dataset to download and convert
+ folder (str): specify folder to download raw d4rl datasets and converted d4rl datasets to.
+ A `d4rl` subfolder will be created in this folder with the raw d4rl dataset, and
+ a `d4rl/converted` subfolder will be created in this folder with the converted
+ datasets (if they do not already exist). Defaults to the datasets folder at
+ the top-level of the repository.
+
+Example usage:
+
+ # downloads to default path at robomimic/datasets/d4rl
+ python convert_d4rl.py --env walker2d-medium-expert-v2
+
+ # download to custom path
+ python convert_d4rl.py --env walker2d-medium-expert-v2 --folder /path/to/folder
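+
+    # the converted file stores one group per trajectory; a sketch of the resulting layout:
+    #   data/demo_0/obs/flat, data/demo_0/next_obs/flat, data/demo_0/actions,
+    #   data/demo_0/rewards, data/demo_0/dones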
+"""
+
+import os
+import h5py
+import json
+import argparse
+import numpy as np
+
+import gym
+import d4rl
+import robomimic
+from robomimic.envs.env_gym import EnvGym
+from robomimic.utils.log_utils import custom_tqdm
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--env",
+ type=str,
+ help="d4rl env name, which specifies the dataset to download and convert",
+ )
+ parser.add_argument(
+ "--folder",
+ type=str,
+ default=None,
+ help="specify folder to download raw d4rl datasets and converted d4rl datasets to.\
+ A `d4rl` subfolder will be created in this folder with the raw d4rl dataset, and\
+ a `d4rl/converted` subfolder will be created in this folder with the converted\
+ datasets (if they do not already exist). Defaults to the datasets folder at\
+ the top-level of the repository.",
+ )
+ args = parser.parse_args()
+
+ base_folder = args.folder
+ if base_folder is None:
+ base_folder = os.path.join(robomimic.__path__[0], "../datasets")
+ base_folder = os.path.join(base_folder, "d4rl")
+
+ # get dataset
+ d4rl.set_dataset_path(base_folder)
+ env = gym.make(args.env)
+ ds = env.env.get_dataset()
+ env.close()
+
+ # env
+ env = EnvGym(args.env)
+
+ # output file
+ write_folder = os.path.join(base_folder, "converted")
+ if not os.path.exists(write_folder):
+ os.makedirs(write_folder)
+ output_path = os.path.join(base_folder, "converted", "{}.hdf5".format(args.env.replace("-", "_")))
+ f_sars = h5py.File(output_path, "w")
+ f_sars_grp = f_sars.create_group("data")
+
+ # code to split D4RL data into trajectories
+ # (modified from https://github.com/aviralkumar2907/d4rl_evaluations/blob/bear_intergrate/bear/examples/bear_hdf5_d4rl.py#L18)
+ all_obs = ds['observations']
+ all_act = ds['actions']
+ N = all_obs.shape[0]
+
+ obs = all_obs[:N-1]
+ actions = all_act[:N-1]
+ next_obs = all_obs[1:]
+ rewards = np.squeeze(ds['rewards'][:N-1])
+ dones = np.squeeze(ds['terminals'][:N-1]).astype(np.int32)
+
+ assert 'timeouts' in ds
+ timeouts = ds['timeouts'][:]
+
+ ctr = 0
+ total_samples = 0
+ num_traj = 0
+ traj = dict(obs=[], next_obs=[], actions=[], rewards=[], dones=[])
+
+ print("\nConverting hdf5...")
+ for idx in custom_tqdm(range(obs.shape[0])):
+
+ # add transition
+ traj["obs"].append(obs[idx])
+ traj["actions"].append(actions[idx])
+ traj["rewards"].append(rewards[idx])
+ traj["next_obs"].append(next_obs[idx])
+ traj["dones"].append(dones[idx])
+ ctr += 1
+
+ # if hit timeout or done is True, end the current trajectory and start a new trajectory
+ if timeouts[idx] or dones[idx]:
+
+ # replace next obs with copy of current obs for final timestep, and make sure done is true
+ traj["next_obs"][-1] = np.array(obs[idx])
+ traj["dones"][-1] = 1
+
+ # store trajectory
+ ep_data_grp = f_sars_grp.create_group("demo_{}".format(num_traj))
+ ep_data_grp.create_dataset("obs/flat", data=np.array(traj["obs"]))
+ ep_data_grp.create_dataset("next_obs/flat", data=np.array(traj["next_obs"]))
+ ep_data_grp.create_dataset("actions", data=np.array(traj["actions"]))
+ ep_data_grp.create_dataset("rewards", data=np.array(traj["rewards"]))
+ ep_data_grp.create_dataset("dones", data=np.array(traj["dones"]))
+ ep_data_grp.attrs["num_samples"] = len(traj["actions"])
+ total_samples += len(traj["actions"])
+ num_traj += 1
+
+ # reset
+ ctr = 0
+ traj = dict(obs=[], next_obs=[], actions=[], rewards=[], dones=[])
+
+ print("\nExcluding {} samples at end of file due to no trajectory truncation.".format(len(traj["actions"])))
+ print("Wrote {} trajectories to new converted hdf5 at {}\n".format(num_traj, output_path))
+
+ # metadata
+ f_sars_grp.attrs["total"] = total_samples
+ f_sars_grp.attrs["env_args"] = json.dumps(env.serialize(), indent=4)
+
+ f_sars.close()
+
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_r2d2.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_r2d2.py
new file mode 100644
index 0000000000000000000000000000000000000000..016b9a9d8dbfbfaab5ddcfff2d5b52f677b9e8e6
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_r2d2.py
@@ -0,0 +1,168 @@
+"""
+Add image information to existing r2d2 hdf5 file
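+
+Example usage (a sketch; flags taken from the argparser at the bottom of this file):
+
+    python convert_r2d2.py --folder ~/datasets/r2d2/success --imsize 128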
+"""
+import h5py
+import os
+import numpy as np
+import glob
+from tqdm import tqdm
+import argparse
+import shutil
+import torch
+import pytorch3d.transforms as pt
+
+from r2d2.camera_utils.wrappers.recorded_multi_camera_wrapper import RecordedMultiCameraWrapper
+from r2d2.trajectory_utils.trajectory_reader import TrajectoryReader
+from r2d2.camera_utils.info import camera_type_to_string_dict
+
+def convert_dataset(path, args):
+ recording_folderpath = os.path.join(os.path.dirname(path), "recordings", "MP4")
+ camera_kwargs = dict(
+ hand_camera=dict(image=True, concatenate_images=False, resolution=(args.imsize, args.imsize), resize_func="cv2"),
+ varied_camera=dict(image=True, concatenate_images=False, resolution=(args.imsize, args.imsize), resize_func="cv2"),
+ )
+ camera_reader = RecordedMultiCameraWrapper(recording_folderpath, camera_kwargs)
+
+ output_path = os.path.join(os.path.dirname(path), "trajectory_im{}.h5".format(args.imsize))
+    if os.path.exists(output_path):
+        # if the output dataset already contains images, skip re-conversion
+        with h5py.File(output_path, "r") as f_existing:
+            if "observation/camera/image/hand_camera_image" in f_existing:
+                return
+
+ shutil.copyfile(path, output_path)
+ f = h5py.File(output_path, "a")
+
+ demo_len = f["action"]["cartesian_position"].shape[0]
+
+ if "camera" not in f["observation"]:
+ f["observation"].create_group("camera").create_group("image")
+ image_grp = f["observation/camera/image"]
+
+ """
+ Extract camera type and keys. Examples of what they should look like:
+ camera_type_dict = {
+ '17225336': 'hand_camera',
+ '24013089': 'varied_camera',
+ '25047636': 'varied_camera'
+ }
+ CAM_NAME_TO_KEY_MAPPING = {
+ "hand_camera_image": "17225336_left",
+ "varied_camera_left_image": "25047636_right",
+ "varied_camera_right_image": "24013089_left"
+ }
+ """
+
+ CAM_ID_TO_TYPE = {}
+ for k in f["observation"]["camera_type"]:
+ CAM_ID_TO_TYPE[k] = camera_type_to_string_dict[f["observation"]["camera_type"][k][0]]
+
+ CAM_NAME_TO_KEY_MAPPING = {}
+ for (cam_id, cam_type) in CAM_ID_TO_TYPE.items():
+ if cam_type == "hand_camera":
+ cam_name = "hand_camera_image"
+ cam_key = "{}_left".format(cam_id)
+ elif cam_type == "varied_camera":
+ cam_name = "varied_camera_1_image" if "varied_camera_1_image" not in CAM_NAME_TO_KEY_MAPPING else "varied_camera_2_image"
+ cam_key = "{}_left".format(cam_id)
+ else:
+ raise NotImplementedError
+
+ CAM_NAME_TO_KEY_MAPPING[cam_name] = cam_key
+
+ cam_data = {cam_name: [] for cam_name in CAM_NAME_TO_KEY_MAPPING.keys()}
+ traj_reader = TrajectoryReader(path, read_images=False)
+
+ for index in range(demo_len):
+
+ timestep = traj_reader.read_timestep(index=index)
+ timestamp_dict = timestep["observation"]["timestamp"]["cameras"]
+
+ timestamp_dict = {}
+ camera_obs = camera_reader.read_cameras(
+ index=index, camera_type_dict=CAM_ID_TO_TYPE, timestamp_dict=timestamp_dict
+ )
+ for cam_name in CAM_NAME_TO_KEY_MAPPING.keys():
+ if camera_obs is None:
+ im = np.zeros((args.imsize, args.imsize, 3))
+ else:
+ im_key = CAM_NAME_TO_KEY_MAPPING[cam_name]
+ im = camera_obs["image"][im_key]
+
+ # perform bgr_to_rgb operation
+ im = im[:,:,::-1]
+
+ cam_data[cam_name].append(im)
+
+ for cam_name in cam_data.keys():
+ cam_data[cam_name] = np.array(cam_data[cam_name]).astype(np.uint8)
+ if cam_name in image_grp:
+ del image_grp[cam_name]
+ image_grp.create_dataset(cam_name, data=cam_data[cam_name], compression="gzip")
+
+ # extract action key data
+ action_dict_group = f["action"]
+ for in_ac_key in ["cartesian_position", "cartesian_velocity"]:
+ in_action = action_dict_group[in_ac_key][:]
+ in_pos = in_action[:,:3].astype(np.float64)
+ in_rot = in_action[:,3:6].astype(np.float64)
+ rot_ = torch.from_numpy(in_rot)
+ rot_mat = pt.axis_angle_to_matrix(rot_)
+ rot_6d = pt.matrix_to_rotation_6d(rot_mat).numpy().astype(np.float64)
+
+ if in_ac_key == "cartesian_position":
+ prefix = "abs_"
+ elif in_ac_key == "cartesian_velocity":
+ prefix = "rel_"
+ else:
+ raise ValueError
+
+ this_action_dict = {
+ prefix + 'pos': in_pos,
+ prefix + 'rot_axis_angle': in_rot,
+ prefix + 'rot_6d': rot_6d,
+ }
+ for key, data in this_action_dict.items():
+ if key in action_dict_group:
+ del action_dict_group[key]
+ action_dict_group.create_dataset(key, data=data)
+
+ # ensure all action keys are batched (ie., are not 0-dimensional)
+ for k in action_dict_group:
+ if isinstance(action_dict_group[k], h5py.Dataset) and len(action_dict_group[k].shape) == 1:
+ reshaped_values = np.reshape(action_dict_group[k][:], (-1, 1))
+ del action_dict_group[k]
+ action_dict_group.create_dataset(k, data=reshaped_values)
+
+ f.close()
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--folder",
+ type=str,
+ help="folder containing hdf5's to add camera images to",
+ default="~/datasets/r2d2/success"
+ )
+
+ parser.add_argument(
+ "--imsize",
+ type=int,
+ default=128,
+ help="image size (w and h)",
+ )
+
+ args = parser.parse_args()
+
+ datasets = []
+ for root, dirs, files in os.walk(os.path.expanduser(args.folder)):
+ for f in files:
+ if f == "trajectory.h5":
+ datasets.append(os.path.join(root, f))
+
+ print("converting datasets...")
+ for d in tqdm(datasets):
+ d = os.path.expanduser(d)
+ convert_dataset(d, args)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_robosuite.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_robosuite.py
new file mode 100644
index 0000000000000000000000000000000000000000..8825869824c92791f7bf51058e1e5045ca40b72e
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_robosuite.py
@@ -0,0 +1,75 @@
+"""
+Helper script to convert a dataset collected using robosuite into an hdf5 compatible with
+this repository. Takes a dataset path corresponding to the demo.hdf5 file containing the
+demonstrations. It modifies the dataset in-place. By default, the script also creates a
+90-10 train-validation split.
+
+For more information on collecting datasets with robosuite, see the code link and documentation
+link below.
+
+Code: https://github.com/ARISE-Initiative/robosuite/blob/offline_study/robosuite/scripts/collect_human_demonstrations.py
+
+Documentation: https://robosuite.ai/docs/algorithms/demonstrations.html
+
+Example usage:
+
+ python convert_robosuite.py --dataset /path/to/your/demo.hdf5
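+
+The resulting train-validation split is stored as filter keys inside the hdf5 (groups under
+"mask/", named "train" and "valid" in robomimic's convention), which training configs can then
+select through their filter-key setting.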
+"""
+
+import h5py
+import json
+import argparse
+
+import robomimic.envs.env_base as EB
+from robomimic.scripts.split_train_val import split_train_val_from_hdf5
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ help="path to input hdf5 dataset",
+ )
+ args = parser.parse_args()
+
+ f = h5py.File(args.dataset, "a") # edit mode
+
+ # store env meta
+ env_name = f["data"].attrs["env"]
+ env_info = json.loads(f["data"].attrs["env_info"])
+ env_meta = dict(
+ type=EB.EnvType.ROBOSUITE_TYPE,
+ env_name=env_name,
+ env_version=f["data"].attrs["repository_version"],
+ env_kwargs=env_info,
+ )
+ if "env_args" in f["data"].attrs:
+ del f["data"].attrs["env_args"]
+ f["data"].attrs["env_args"] = json.dumps(env_meta, indent=4)
+
+ print("====== Stored env meta ======")
+ print(f["data"].attrs["env_args"])
+
+ # store metadata about number of samples
+ total_samples = 0
+ for ep in f["data"]:
+ # ensure model-xml is in per-episode metadata
+ assert "model_file" in f["data/{}".format(ep)].attrs
+
+ # add "num_samples" into per-episode metadata
+ if "num_samples" in f["data/{}".format(ep)].attrs:
+ del f["data/{}".format(ep)].attrs["num_samples"]
+ n_sample = f["data/{}/actions".format(ep)].shape[0]
+ f["data/{}".format(ep)].attrs["num_samples"] = n_sample
+ total_samples += n_sample
+
+ # add total samples to global metadata
+ if "total" in f["data"].attrs:
+ del f["data"].attrs["total"]
+ f["data"].attrs["total"] = total_samples
+
+ f.close()
+
+ # create 90-10 train-validation split in the dataset
+ split_train_val_from_hdf5(hdf5_path=args.dataset, val_ratio=0.1)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_roboturk_pilot.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_roboturk_pilot.py
new file mode 100644
index 0000000000000000000000000000000000000000..2105980453d59be953f3fbe58a3a5ace12a8dccb
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_roboturk_pilot.py
@@ -0,0 +1,192 @@
+"""
+Helper script to convert the RoboTurk Pilot datasets (https://roboturk.stanford.edu/dataset_sim.html)
+into a format compatible with this repository. It will also create some useful filter keys
+in the file (e.g. training, validation, and fastest n trajectories). Prior work
+(https://arxiv.org/abs/1911.05321) has found this useful (for example, training on the
+fastest 225 demonstrations for bins-Can).
+
+Direct download link for dataset: http://cvgl.stanford.edu/projects/roboturk/RoboTurkPilot.zip
+
+Args:
+ folder (str): path to a folder containing a demo.hdf5 and a models directory containing
+ mujoco xml files. For example, RoboTurkPilot/bins-Can.
+
+ n (int): creates a filter key corresponding to the n fastest trajectories. Defaults to 225.
+
+Example usage:
+
+ python convert_roboturk_pilot.py --folder /path/to/RoboTurkPilot/bins-Can --n 225
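+
+In addition to the "train"/"valid" filter keys, the script creates a "fastest_{n}" filter key
+(and, following robomimic's convention, corresponding "fastest_{n}_train"/"fastest_{n}_valid"
+splits) that can be selected at training time.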
+"""
+
+import os
+import h5py
+import json
+import argparse
+import numpy as np
+from tqdm import tqdm
+
+import robomimic
+import robomimic.envs.env_base as EB
+from robomimic.utils.file_utils import create_hdf5_filter_key
+from robomimic.scripts.split_train_val import split_train_val_from_hdf5
+
+
+def convert_rt_pilot_hdf5(ref_folder):
+ """
+ Uses the reference demo hdf5 to write a new converted hdf5 compatible with
+ the repository.
+
+ Args:
+ ref_folder (str): path to a folder containing a demo.hdf5 and a models directory containing
+ mujoco xml files.
+ """
+ hdf5_path = os.path.join(ref_folder, "demo.hdf5")
+ new_path = os.path.join(ref_folder, "demo_new.hdf5")
+
+ f = h5py.File(hdf5_path, "r")
+ f_new = h5py.File(new_path, "w")
+ f_new_grp = f_new.create_group("data")
+
+ # sorted list of demonstrations by demo number
+ demos = list(f["data"].keys())
+ inds = np.argsort([int(elem[5:]) for elem in demos])
+ demos = [demos[i] for i in inds]
+
+ # write each demo
+ num_samples_arr = []
+ for demo_id in tqdm(range(len(demos))):
+ ep = demos[demo_id]
+
+ # create group for this demonstration
+ ep_data_grp = f_new_grp.create_group(ep)
+
+ # copy states over
+ states = f["data/{}/states".format(ep)][()]
+ ep_data_grp.create_dataset("states", data=np.array(states))
+
+ # concat jvels and gripper actions to form full actions
+ jvels = f["data/{}/joint_velocities".format(ep)][()]
+ gripper_acts = f["data/{}/gripper_actuations".format(ep)][()]
+ actions = np.concatenate([jvels, gripper_acts], axis=1)
+
+ # IMPORTANT: clip actions to -1, 1, since this is expected by the codebase
+ actions = np.clip(actions, -1., 1.)
+ ep_data_grp.create_dataset("actions", data=actions)
+
+ # store model xml directly in the new hdf5 file
+ model_path = os.path.join(ref_folder, "models", f["data/{}".format(ep)].attrs["model_file"])
+ f_model = open(model_path, "r")
+ model_xml = f_model.read()
+ f_model.close()
+ ep_data_grp.attrs["model_file"] = model_xml
+
+ # store num samples for this ep
+ num_samples = actions.shape[0]
+ ep_data_grp.attrs["num_samples"] = num_samples # number of transitions in this episode
+ num_samples_arr.append(num_samples)
+
+ # write dataset attributes (metadata)
+ f_new_grp.attrs["total"] = np.sum(num_samples_arr)
+
+ # construct and save env metadata
+ env_meta = dict()
+ env_meta["type"] = EB.EnvType.ROBOSUITE_TYPE
+ env_meta["env_name"] = (f["data"].attrs["env"] + "Teleop")
+ # hardcode robosuite v0.3 args
+ robosuite_args = {
+ "has_renderer": False,
+ "has_offscreen_renderer": False,
+ "ignore_done": True,
+ "use_object_obs": True,
+ "use_camera_obs": False,
+ "camera_depth": False,
+ "camera_height": 84,
+ "camera_width": 84,
+ "camera_name": "agentview",
+ "gripper_visualization": False,
+ "reward_shaping": False,
+ "control_freq": 100,
+ }
+ env_meta["env_kwargs"] = robosuite_args
+ f_new_grp.attrs["env_args"] = json.dumps(env_meta, indent=4) # environment info
+
+ print("\n====== Added env meta ======")
+ print(f_new_grp.attrs["env_args"])
+
+ f.close()
+ f_new.close()
+
+ # back up the old dataset, and replace with new dataset
+ os.rename(hdf5_path, os.path.join(ref_folder, "demo_bak.hdf5"))
+ os.rename(new_path, hdf5_path)
+
+
+def split_fastest_from_hdf5(hdf5_path, n):
+ """
+ Creates filter key for fastest N trajectories, named
+ "fastest_{}".format(n).
+
+ Args:
+ hdf5_path (str): path to the hdf5 file
+
+ n (int): fastest n demos to create filter key for
+ """
+
+ # retrieve fastest n demos
+ f = h5py.File(hdf5_path, "r")
+ demos = sorted(list(f["data"].keys()))
+ traj_lengths = []
+ for ep in demos:
+ traj_lengths.append(f["data/{}/actions".format(ep)].shape[0])
+ inds = np.argsort(traj_lengths)[:n]
+ filtered_demos = [demos[i] for i in inds]
+ f.close()
+
+ # create filter key
+ name = "fastest_{}".format(n)
+ lengths = create_hdf5_filter_key(hdf5_path=hdf5_path, demo_keys=filtered_demos, key_name=name)
+
+ print("Total number of samples in fastest {} demos: {}".format(n, np.sum(lengths)))
+ print("Average number of samples in fastest {} demos: {}".format(n, np.mean(lengths)))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--folder",
+ type=str,
+ help="path to a folder containing a demo.hdf5 and a models directory containing \
+ mujoco xml files. For example, RoboTurkPilot/bins-Can.",
+ )
+ parser.add_argument(
+ "--n",
+ type=int,
+ default=225,
+ help="creates a filter key corresponding to the n fastest trajectories. Defaults to 225.",
+ )
+ args = parser.parse_args()
+
+ # convert hdf5
+ convert_rt_pilot_hdf5(ref_folder=args.folder)
+
+ # create 90-10 train-validation split in the dataset
+ print("\nCreating 90-10 train-validation split...\n")
+ hdf5_path = os.path.join(args.folder, "demo.hdf5")
+ split_train_val_from_hdf5(hdf5_path=hdf5_path, val_ratio=0.1)
+
+ print("\nCreating filter key for fastest {} trajectories...".format(args.n))
+ split_fastest_from_hdf5(hdf5_path=hdf5_path, n=args.n)
+
+ print("\nCreating 90-10 train-validation split for fastest {} trajectories...".format(args.n))
+ split_train_val_from_hdf5(hdf5_path=hdf5_path, val_ratio=0.1, filter_key="fastest_{}".format(args.n))
+
+ print(
+ "\nWARNING: new dataset has replaced old one in demo.hdf5 file. "
+ "The old dataset file has been moved to demo_bak.hdf5"
+ )
+
+ print(
+ "\nNOTE: the new dataset also contains a fastest_{} filter key, for an easy way "
+ "to train on the fastest trajectories. Just set config.train.hdf5_filter to train on this "
+ "subset. A common choice is 225 when training on the bins-Can dataset.\n".format(args.n)
+ )
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_to_robosuite_v141.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_to_robosuite_v141.py
new file mode 100644
index 0000000000000000000000000000000000000000..caf694c7abf17d8b28b3ff78bf99bf710d22072b
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/convert_to_robosuite_v141.py
@@ -0,0 +1,156 @@
+import h5py
+import json
+import argparse
+import os
+from shutil import copyfile
+import robosuite
+import xml.etree.ElementTree as ET
+
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.env_utils as EnvUtils
+import robomimic.utils.file_utils as FileUtils
+
+from robosuite.utils.mjcf_utils import find_elements
+
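+# Example usage (a sketch; arguments from the argparser below, robosuite v1.4.1 required):
+#   python convert_to_robosuite_v141.py --dataset /path/to/demo.hdf5 --output_dataset /path/to/demo_v141.hdf5
+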
+def replace_elem(parent, old_elem, new_elem):
+ """
+ code adapted from https://stackoverflow.com/a/20931505
+ """
+ parent_index = list(parent).index(old_elem)
+ parent.remove(old_elem)
+ parent.insert(parent_index, new_elem)
+
+def convert_xml(old_xml_str, env_name, env):
+ """
+    Postprocess xml string generated by robosuite to be compatible with robosuite v1.4.1.
+    This function should not modify the xml string if it was already generated using robosuite v1.4.1.
+    Args:
+        old_xml_str (str): xml string to process (from an older robosuite version, e.g. v1.2)
+ """
+
+ if env_name in ["PickPlaceCan", "NutAssemblySquare", "ToolHang"]:
+ xml_str = env.env.sim.model.get_xml()
+ elif env_name == "Lift":
+ xml_str = env.env.sim.model.get_xml()
+ # replace the cube_g0 and cube_g0_vis with elements in old_xml_str
+ old_et = ET.ElementTree(ET.fromstring(old_xml_str)).getroot()
+ new_et = ET.ElementTree(ET.fromstring(xml_str)).getroot()
+
+ cube_new = find_elements(
+ root=new_et,
+ tags="body",
+ attribs={"name": "cube_main"},
+ return_first=True
+ )
+
+ cube_old = find_elements(
+ root=old_et,
+ tags="body",
+ attribs={"name": "cube_main"},
+ return_first=True
+ )
+
+ worldbody_new = find_elements(
+ root=new_et,
+ tags="worldbody",
+ return_first=True
+ )
+
+ replace_elem(worldbody_new, cube_new, cube_old)
+
+ xml_str = ET.tostring(new_et, encoding="utf8").decode("utf8")
+ elif env_name == "TwoArmTransport":
+ xml_str = env.env.sim.model.get_xml()
+ # replace the cube_g0 and cube_g0_vis with elements in old_xml_str
+ old_et = ET.ElementTree(ET.fromstring(old_xml_str)).getroot()
+ new_et = ET.ElementTree(ET.fromstring(xml_str)).getroot()
+
+ worldbody_new = find_elements(
+ root=new_et,
+ tags="worldbody",
+ return_first=True
+ )
+ for bname in [
+ "payload_root",
+
+ ### ignore all these other following assets (makes playback worse for some reason...)
+ # "trash_main",
+ # "transport_start_bin_root", "transport_target_bin_root",
+ # "transport_trash_bin_root", "transport_start_bin_lid_root"
+ ]:
+ body_new = find_elements(
+ root=new_et,
+ tags="body",
+ attribs={"name": bname},
+ return_first=True
+ )
+
+ body_old = find_elements(
+ root=old_et,
+ tags="body",
+ attribs={"name": bname},
+ return_first=True
+ )
+
+ replace_elem(worldbody_new, body_new, body_old)
+
+ xml_str = ET.tostring(new_et, encoding="utf8").decode("utf8")
+    else:
+        # unrecognized env: leave the xml string unchanged
+        xml_str = old_xml_str
+
+    return xml_str
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ help="path to input hdf5 dataset",
+ )
+ parser.add_argument(
+ "--output_dataset",
+ type=str,
+ help="path to output hdf5 dataset",
+ )
+ args = parser.parse_args()
+
+ args.dataset = os.path.expanduser(args.dataset)
+ args.output_dataset = os.path.expanduser(args.output_dataset)
+
+ assert args.output_dataset != args.dataset
+ assert robosuite.__version__ == '1.4.1'
+
+ copyfile(args.dataset, args.output_dataset)
+
+ f = h5py.File(args.output_dataset, "r+")
+
+ env_args = json.loads(f["data"].attrs["env_args"])
+ env_name = env_args["env_name"]
+
+ env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path=args.dataset)
+ env_type = EnvUtils.get_env_type(env_meta=env_meta)
+
+ # need to make sure ObsUtils knows which observations are images, but it doesn't matter
+ # for playback since observations are unused. Pass a dummy spec here.
+ dummy_spec = dict(
+ obs=dict(
+ low_dim=["robot0_eef_pos"],
+ rgb=[],
+ ),
+ )
+ ObsUtils.initialize_obs_utils_with_obs_specs(obs_modality_specs=dummy_spec)
+
+ env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path=args.dataset)
+ env = EnvUtils.create_env_from_metadata(env_meta=env_meta, render=False, render_offscreen=True)
+ env.reset()
+
+ for demo_key in list(f["data"].keys()):
+ ep_data_grp = f["data/{}".format(demo_key)]
+ model_file = ep_data_grp.attrs["model_file"]
+
+        converted_model_file = convert_xml(model_file, env_name, env)
+        ep_data_grp.attrs["model_file"] = converted_model_file
+
+ env_args = json.loads(f["data"].attrs["env_args"])
+ env_args["env_version"] = robosuite.__version__
+ f["data"].attrs["env_args"] = json.dumps(env_args, indent=4)
+
+ f.close()
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/extract_action_dict.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/extract_action_dict.py
new file mode 100644
index 0000000000000000000000000000000000000000..798f59263e03c32fb806b41f7a3a07aa18152631
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/extract_action_dict.py
@@ -0,0 +1,71 @@
+import argparse
+import pathlib
+import sys
+import tqdm
+import h5py
+import numpy as np
+import torch
+import os
+
+# needed for the TorchUtils.axis_angle_to_rot_6d call below (assumes this robomimic fork's torch_utils provides it)
+import robomimic.utils.torch_utils as TorchUtils
+
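+# Example usage (a sketch; writes rel_*/abs_* entries into each demo's "action_dict" group in-place):
+#   python extract_action_dict.py --dataset /path/to/demo.hdf5
+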
+def extract_action_dict(args):
+ # find files
+ f = h5py.File(os.path.expanduser(args.dataset), mode="r+")
+
+ SPECS = [
+ dict(
+ key="actions",
+ is_absolute=False,
+ ),
+ dict(
+ key="actions_abs",
+ is_absolute=True,
+ )
+ ]
+
+ # execute
+ for spec in SPECS:
+ input_action_key = spec["key"]
+ is_absolute = spec["is_absolute"]
+
+ if is_absolute:
+ prefix = "abs_"
+ else:
+ prefix = "rel_"
+
+ for demo in f['data'].values():
+ in_action = demo[str(input_action_key)][:]
+ in_pos = in_action[:,:3].astype(np.float32)
+ in_rot = in_action[:,3:6].astype(np.float32)
+ in_grip = in_action[:,6:].astype(np.float32)
+
+ rot_ = torch.from_numpy(in_rot)
+ rot_6d = TorchUtils.axis_angle_to_rot_6d(rot_).numpy().astype(np.float32)
+
+ this_action_dict = {
+ prefix + 'pos': in_pos,
+ prefix + 'rot_axis_angle': in_rot,
+ prefix + 'rot_6d': rot_6d,
+ 'gripper': in_grip
+ }
+ # if 'action_dict' in demo:
+ # del demo['action_dict']
+ action_dict_group = demo.require_group('action_dict')
+ for key, data in this_action_dict.items():
+ if key in action_dict_group:
+ del action_dict_group[key]
+ action_dict_group.create_dataset(key, data=data)
+
+ f.close()
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ required=True
+ )
+
+ args = parser.parse_args()
+
+ extract_action_dict(args)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/robosuite_add_absolute_actions.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/robosuite_add_absolute_actions.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1565d00b97d513ea1ea41c5fb3bedf85923560
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/robosuite_add_absolute_actions.py
@@ -0,0 +1,290 @@
+if __name__ == "__main__":
+ import sys
+ import os
+ import pathlib
+
+ ROOT_DIR = str(pathlib.Path(__file__).parent.parent.parent)
+ sys.path.append(ROOT_DIR)
+
+import multiprocessing
+import os
+import shutil
+import click
+import pathlib
+import h5py
+from tqdm import tqdm
+import collections
+import pickle
+
+import numpy as np
+import copy
+
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.file_utils as FileUtils
+import robomimic.utils.env_utils as EnvUtils
+from scipy.spatial.transform import Rotation
+
+from robomimic.config import config_factory
+
+"""
+copied/adapted from https://github.com/columbia-ai-robotics/diffusion_policy/blob/main/diffusion_policy/common/robomimic_util.py
+"""
+class RobomimicAbsoluteActionConverter:
+ def __init__(self, dataset_path, algo_name='bc'):
+ # default BC config
+ config = config_factory(algo_name=algo_name)
+
+ # read config to set up metadata for observation modalities (e.g. detecting rgb observations)
+        # must be run before creating the dataset
+ ObsUtils.initialize_obs_utils_with_config(config)
+
+ env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path)
+ abs_env_meta = copy.deepcopy(env_meta)
+ abs_env_meta['env_kwargs']['controller_configs']['control_delta'] = False
+
+ env = EnvUtils.create_env_from_metadata(
+ env_meta=env_meta,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=False,
+ )
+ assert len(env.env.robots) in (1, 2)
+
+ abs_env = EnvUtils.create_env_from_metadata(
+ env_meta=abs_env_meta,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=False,
+ )
+ assert not abs_env.env.robots[0].controller.use_delta
+
+ self.env = env
+ self.abs_env = abs_env
+ self.file = h5py.File(dataset_path, 'r')
+
+ def __len__(self):
+ return len(self.file['data'])
+
+ def convert_actions(self,
+ states: np.ndarray,
+ actions: np.ndarray) -> np.ndarray:
+ """
+        Given a state and delta-action sequence, generate the equivalent goal position and
+        orientation for each step, keeping the original gripper action intact.
+ """
+ # in case of multi robot
+ # reshape (N,14) to (N,2,7)
+ # or (N,7) to (N,1,7)
+ stacked_actions = actions.reshape(*actions.shape[:-1],-1,7)
+
+ env = self.env
+ # generate abs actions
+ action_goal_pos = np.zeros(
+ stacked_actions.shape[:-1]+(3,),
+ dtype=stacked_actions.dtype)
+ action_goal_ori = np.zeros(
+ stacked_actions.shape[:-1]+(3,),
+ dtype=stacked_actions.dtype)
+ action_gripper = stacked_actions[...,[-1]]
+ for i in range(len(states)):
+ _ = env.reset_to({'states': states[i]})
+
+ # taken from robot_env.py L#454
+ for idx, robot in enumerate(env.env.robots):
+ # run controller goal generator
+ robot.control(stacked_actions[i,idx], policy_step=True)
+
+ # read pos and ori from robots
+ controller = robot.controller
+ action_goal_pos[i,idx] = controller.goal_pos
+ action_goal_ori[i,idx] = Rotation.from_matrix(
+ controller.goal_ori).as_rotvec()
+
+ stacked_abs_actions = np.concatenate([
+ action_goal_pos,
+ action_goal_ori,
+ action_gripper
+ ], axis=-1)
+ abs_actions = stacked_abs_actions.reshape(actions.shape)
+ return abs_actions
+
+ def convert_idx(self, idx):
+ file = self.file
+ demo = file[f'data/demo_{idx}']
+ # input
+ states = demo['states'][:]
+ actions = demo['actions'][:]
+
+ # generate abs actions
+ abs_actions = self.convert_actions(states, actions)
+ return abs_actions
+
+ def convert_and_eval_idx(self, idx):
+ env = self.env
+ abs_env = self.abs_env
+ file = self.file
+        # the first step has high error for some reason, so it is not representative
+ eval_skip_steps = 1
+
+ demo = file[f'data/demo_{idx}']
+ # input
+ states = demo['states'][:]
+ actions = demo['actions'][:]
+
+ # generate abs actions
+ abs_actions = self.convert_actions(states, actions)
+
+ # verify
+ robot0_eef_pos = demo['obs']['robot0_eef_pos'][:]
+ robot0_eef_quat = demo['obs']['robot0_eef_quat'][:]
+
+ delta_error_info = self.evaluate_rollout_error(
+ env, states, actions, robot0_eef_pos, robot0_eef_quat,
+ metric_skip_steps=eval_skip_steps)
+ abs_error_info = self.evaluate_rollout_error(
+ abs_env, states, abs_actions, robot0_eef_pos, robot0_eef_quat,
+ metric_skip_steps=eval_skip_steps)
+
+ info = {
+ 'delta_max_error': delta_error_info,
+ 'abs_max_error': abs_error_info
+ }
+ return abs_actions, info
+
+ @staticmethod
+ def evaluate_rollout_error(env,
+ states, actions,
+ robot0_eef_pos,
+ robot0_eef_quat,
+ metric_skip_steps=1):
+        # the first step has high error for some reason, so it is not representative
+
+ # evaluate abs actions
+ rollout_next_states = list()
+ rollout_next_eef_pos = list()
+ rollout_next_eef_quat = list()
+ obs = env.reset_to({'states': states[0]})
+ for i in range(len(states)):
+ obs = env.reset_to({'states': states[i]})
+ obs, reward, done, info = env.step(actions[i])
+ obs = env.get_observation()
+ rollout_next_states.append(env.get_state()['states'])
+ rollout_next_eef_pos.append(obs['robot0_eef_pos'])
+ rollout_next_eef_quat.append(obs['robot0_eef_quat'])
+ rollout_next_states = np.array(rollout_next_states)
+ rollout_next_eef_pos = np.array(rollout_next_eef_pos)
+ rollout_next_eef_quat = np.array(rollout_next_eef_quat)
+
+ next_state_diff = states[1:] - rollout_next_states[:-1]
+ max_next_state_diff = np.max(np.abs(next_state_diff[metric_skip_steps:]))
+
+ next_eef_pos_diff = robot0_eef_pos[1:] - rollout_next_eef_pos[:-1]
+ next_eef_pos_dist = np.linalg.norm(next_eef_pos_diff, axis=-1)
+ max_next_eef_pos_dist = next_eef_pos_dist[metric_skip_steps:].max()
+
+ next_eef_rot_diff = Rotation.from_quat(robot0_eef_quat[1:]) \
+ * Rotation.from_quat(rollout_next_eef_quat[:-1]).inv()
+ next_eef_rot_dist = next_eef_rot_diff.magnitude()
+ max_next_eef_rot_dist = next_eef_rot_dist[metric_skip_steps:].max()
+
+ info = {
+ 'state': max_next_state_diff,
+ 'pos': max_next_eef_pos_dist,
+ 'rot': max_next_eef_rot_dist
+ }
+ return info
+
+"""
+copied/adapted from https://github.com/columbia-ai-robotics/diffusion_policy/blob/main/diffusion_policy/scripts/robomimic_dataset_conversion.py
+"""
+def worker(x):
+ path, idx, do_eval = x
+ converter = RobomimicAbsoluteActionConverter(path)
+ if do_eval:
+ abs_actions, info = converter.convert_and_eval_idx(idx)
+ else:
+ abs_actions = converter.convert_idx(idx)
+ info = dict()
+ return abs_actions, info
+
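+# Example usage (a sketch; options defined by the click command below):
+#   python robosuite_add_absolute_actions.py -i /path/to/demo.hdf5 -o /path/to/demo_abs.hdf5 -n 8
+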
+@click.command()
+@click.option('-i', '--input', required=True, help='input hdf5 path')
+@click.option('-o', '--output', required=True, help='output hdf5 path. Parent directory must exist')
+@click.option('-e', '--eval_dir', default=None, help='directory to output evaluation metrics')
+@click.option('-n', '--num_workers', default=None, type=int)
+def main(input, output, eval_dir, num_workers):
+ # process inputs
+ input = pathlib.Path(input).expanduser()
+ assert input.is_file()
+ output = pathlib.Path(output).expanduser()
+ assert output.parent.is_dir()
+ assert not output.is_dir()
+
+ do_eval = False
+ if eval_dir is not None:
+ eval_dir = pathlib.Path(eval_dir).expanduser()
+ assert eval_dir.parent.exists()
+ do_eval = True
+
+ converter = RobomimicAbsoluteActionConverter(input)
+
+ # run
+ with multiprocessing.Pool(num_workers) as pool:
+ results = pool.map(worker, [(input, i, do_eval) for i in range(len(converter))])
+
+ # save output
+ print('Copying hdf5')
+ shutil.copy(str(input), str(output))
+
+ # modify action
+ with h5py.File(output, 'r+') as out_file:
+ for i in tqdm(range(len(converter)), desc="Writing to output"):
+ abs_actions, info = results[i]
+ demo = out_file[f'data/demo_{i}']
+ if "actions_abs" not in demo:
+ demo.create_dataset("actions_abs", data=np.array(abs_actions))
+ else:
+ demo['actions_abs'][:] = abs_actions
+
+ # save eval
+ if do_eval:
+ eval_dir.mkdir(parents=False, exist_ok=True)
+
+ print("Writing error_stats.pkl")
+ infos = [info for _, info in results]
+ pickle.dump(infos, eval_dir.joinpath('error_stats.pkl').open('wb'))
+
+ print("Generating visualization")
+ metrics = ['pos', 'rot']
+ metrics_dicts = dict()
+ for m in metrics:
+ metrics_dicts[m] = collections.defaultdict(list)
+
+ for i in range(len(infos)):
+ info = infos[i]
+ for k, v in info.items():
+ for m in metrics:
+ metrics_dicts[m][k].append(v[m])
+
+ from matplotlib import pyplot as plt
+ plt.switch_backend('PDF')
+
+ fig, ax = plt.subplots(1, len(metrics))
+ for i in range(len(metrics)):
+ axis = ax[i]
+ data = metrics_dicts[metrics[i]]
+ for key, value in data.items():
+ axis.plot(value, label=key)
+ axis.legend()
+ axis.set_title(metrics[i])
+ fig.set_size_inches(10,4)
+ fig.savefig(str(eval_dir.joinpath('error_stats.pdf')))
+ fig.savefig(str(eval_dir.joinpath('error_stats.png')))
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/set_dataset_attr.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/set_dataset_attr.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f148d08f0bc9ef6336c919ee7cf4e639aded8b5
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/conversion/set_dataset_attr.py
@@ -0,0 +1,98 @@
+"""
+Example:
+python robomimic/scripts/conversion/set_dataset_attr.py --glob 'datasets/**/*_abs.hdf5' --env_args env_kwargs.controller_configs.control_delta=false absolute_actions=true
+"""
+import argparse
+import pathlib
+import json
+import sys
+import tqdm
+import h5py
+
+def update_env_args_dict(env_args_dict: dict, key: tuple, value):
+ if key is None:
+ return env_args_dict
+ elif len(key) == 0:
+ return env_args_dict
+ elif len(key) == 1:
+ env_args_dict[key[0]] = value
+ return env_args_dict
+ else:
+ this_key = key[0]
+ if this_key not in env_args_dict:
+ env_args_dict[this_key] = dict()
+ update_env_args_dict(env_args_dict[this_key], key[1:], value)
+ return env_args_dict
+
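+# Example (illustrative sketch of the recursion above):
+#   update_env_args_dict({"env_kwargs": {}}, ("env_kwargs", "controller_configs", "control_delta"), False)
+#   -> {"env_kwargs": {"controller_configs": {"control_delta": False}}}
+# Intermediate dicts are created as needed; a None or empty key returns the dict unchanged.
+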
+def main():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--glob",
+ type=str,
+ required=True
+ )
+
+ parser.add_argument(
+ "--env_args",
+ type=str,
+ default=None
+ )
+
+ parser.add_argument(
+ 'attrs',
+ nargs='*'
+ )
+
+ args = parser.parse_args()
+
+ # parse attrs to set
+ # format: key=value
+ # values are parsed with json
+ attrs_dict = dict()
+ for attr_arg in args.attrs:
+ key, svalue = attr_arg.split("=")
+ value = json.loads(svalue)
+ attrs_dict[key] = value
+
+ # parse env_args update
+ env_args_key = None
+ env_args_value = None
+ if args.env_args is not None:
+ key, svalue = args.env_args.split('=')
+ env_args_key = key.split('.')
+ env_args_value = json.loads(svalue)
+
+ # find files
+ file_paths = list(pathlib.Path.cwd().glob(args.glob))
+
+ # confirm with the user
+ print("Found matching files:")
+ for f in file_paths:
+ print(f)
+ print("Are you sure to modify these files with the following attributes:")
+ print(json.dumps(attrs_dict, indent=2))
+ if env_args_key is not None:
+ print("env_args."+'.'.join(env_args_key)+'='+str(env_args_value))
+ result = input("[y/n]?")
+ if 'y' not in result:
+ sys.exit(0)
+
+ # execute
+ for file_path in tqdm.tqdm(file_paths):
+ with h5py.File(str(file_path), mode='r+') as file:
+ # update env_args
+ if env_args_key is not None:
+ env_args = file['data'].attrs['env_args']
+ env_args_dict = json.loads(env_args)
+ env_args_dict = update_env_args_dict(
+ env_args_dict=env_args_dict,
+ key=env_args_key, value=env_args_value)
+ env_args = json.dumps(env_args_dict)
+ file['data'].attrs['env_args'] = env_args
+
+ # update other attrs
+ file['data'].attrs.update(attrs_dict)
+
+if __name__ == "__main__":
+ main()
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/convert_actions.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/convert_actions.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0048ac44a47af194e641e2e1220b66030ef869b
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/convert_actions.py
@@ -0,0 +1,89 @@
+"""
+Helper script to prepare datasets for diffusion policy training by (1) adding absolute actions and (2)
+writing the absolute actions to action dictionaries.
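+
+Example usage (a sketch; flags taken from the argparser below):
+
+    python convert_actions.py --datasets /tmp/stack/src_10.hdf5 --output_name src_10_abs.hdf5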
+"""
+import os
+import h5py
+import argparse
+import socket
+import json
+import numpy as np
+
+import robomimic
+import robomimic.macros as Macros
+from robomimic.scripts.conversion.extract_action_dict import extract_action_dict
+
+import mimicgen
+from mimicgen.scripts.add_datagen_info import add_datagen_info
+
+DATASETS = [
+ "/tmp/coffee/src_10.hdf5",
+ "/tmp/stack/src_10.hdf5",
+]
+
+
+def convert_actions_in_dataset(dataset_path, output_name=None, absolute_mg=False):
+ """
+ Helper function to call the relevant scripts to get absolute action dicts for a given dataset.
+ """
+
+ # first get absolute actions
+ args = argparse.Namespace()
+ args.dataset = dataset_path
+ args.n = None
+ args.absolute = True
+ args.absolute_mg = absolute_mg
+
+ new_ds_path = dataset_path
+ if output_name is not None:
+ args.output = os.path.join(os.path.dirname(dataset_path), output_name)
+ new_ds_path = args.output
+ else:
+ args.output = None
+ add_datagen_info(args)
+
+ # next convert actions to dict
+ args = argparse.Namespace()
+ args.dataset = new_ds_path
+ extract_action_dict(args)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--datasets",
+ type=str,
+ nargs='+',
+ default=None,
+ )
+ parser.add_argument(
+ "--output_name",
+ type=str,
+ default=None,
+ )
+ parser.add_argument(
+ "--absolute_mg",
+ action='store_true',
+ help="extract absolute actions using existing datagen info, and skip extraction of datagen info",
+ )
+ parser.add_argument(
+ "--slack",
+ action='store_true',
+ help="try to give slack notification after script finishes",
+ )
+ args = parser.parse_args()
+
+ datasets = args.datasets
+ if datasets is None:
+ datasets = DATASETS
+
+ for d in datasets:
+ dataset_path = os.path.expanduser(d)
+ convert_actions_in_dataset(dataset_path, output_name=args.output_name, absolute_mg=args.absolute_mg)
+
+ if args.slack and (Macros.SLACK_TOKEN is not None):
+ from robomimic.scripts.give_slack_notification import give_slack_notif
+ msg = "Completed the following action conversion run!\nHostname: {}\n".format(socket.gethostname())
+ datasets_json = json.dumps(dict(datasets=datasets), indent=4)
+ msg += "```{}```".format(datasets_json)
+ give_slack_notif(msg)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/dataset_states_to_obs.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/dataset_states_to_obs.py
new file mode 100644
index 0000000000000000000000000000000000000000..6295d4f72dd6a8f11a4dd8a16f06fd7e88d2dc9c
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/dataset_states_to_obs.py
@@ -0,0 +1,425 @@
+"""
+Script to extract observations from low-dimensional simulation states in a robosuite dataset.
+
+Args:
+ dataset (str): path to input hdf5 dataset
+
+ output_name (str): name of output hdf5 dataset
+
+ n (int): if provided, stop after n trajectories are processed
+
+ shaped (bool): if flag is set, use dense rewards
+
+ camera_names (str or [str]): camera name(s) to use for image observations.
+ Leave out to not use image observations.
+
+ camera_height (int): height of image observation.
+
+ camera_width (int): width of image observation
+
+ done_mode (int): how to write done signal. If 0, done is 1 whenever s' is a success state.
+ If 1, done is 1 at the end of each trajectory. If 2, both.
+
+ copy_rewards (bool): if provided, copy rewards from source file instead of inferring them
+
+ copy_dones (bool): if provided, copy dones from source file instead of inferring them
+
+Example usage:
+
+ # extract low-dimensional observations
+ python dataset_states_to_obs.py --dataset /path/to/demo.hdf5 --output_name low_dim.hdf5 --done_mode 2
+
+ # extract 84x84 image observations
+ python dataset_states_to_obs.py --dataset /path/to/demo.hdf5 --output_name image.hdf5 \
+ --done_mode 2 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+ # extract 84x84 image and depth observations
+ python dataset_states_to_obs.py --dataset /path/to/demo.hdf5 --output_name depth.hdf5 \
+ --done_mode 2 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84 --depth
+
+ # (space saving option) extract 84x84 image observations with compression and without
+ # extracting next obs (not needed for pure imitation learning algos)
+ python dataset_states_to_obs.py --dataset /path/to/demo.hdf5 --output_name image.hdf5 \
+ --done_mode 2 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84 \
+ --compress --exclude-next-obs
+
+ # use dense rewards, and only annotate the end of trajectories with done signal
+ python dataset_states_to_obs.py --dataset /path/to/demo.hdf5 --output_name image_dense_done_1.hdf5 \
+        --done_mode 1 --shaped --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+"""
+import os
+import json
+import h5py
+import argparse
+import numpy as np
+from copy import deepcopy
+from tqdm import tqdm
+
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.file_utils as FileUtils
+import robomimic.utils.env_utils as EnvUtils
+from robomimic.envs.env_base import EnvBase
+
+
+def extract_trajectory(
+ env,
+ initial_state,
+ states,
+ actions,
+ done_mode,
+ camera_names=None,
+ camera_height=84,
+ camera_width=84,
+):
+ """
+ Helper function to extract observations, rewards, and dones along a trajectory using
+ the simulator environment.
+
+ Args:
+ env (instance of EnvBase): environment
+ initial_state (dict): initial simulation state to load
+ states (np.array): array of simulation states to load to extract information
+ actions (np.array): array of actions
+ done_mode (int): how to write done signal. If 0, done is 1 whenever s' is a
+ success state. If 1, done is 1 at the end of each trajectory.
+ If 2, do both.
+ """
+ assert isinstance(env, EnvBase)
+ assert states.shape[0] == actions.shape[0]
+
+ # load the initial state
+ env.reset()
+ obs = env.reset_to(initial_state)
+
+ # maybe add in intrinsics and extrinsics for all cameras
+ camera_info = None
+ is_robosuite_env = EnvUtils.is_robosuite_env(env=env)
+ if is_robosuite_env:
+ camera_info = get_camera_info(
+ env=env,
+ camera_names=camera_names,
+ camera_height=camera_height,
+ camera_width=camera_width,
+ )
+
+ traj = dict(
+ obs=[],
+ next_obs=[],
+ rewards=[],
+ dones=[],
+ actions=np.array(actions),
+ states=np.array(states),
+ initial_state_dict=initial_state,
+ )
+ traj_len = states.shape[0]
+ # iteration variable @t is over "next obs" indices
+ for t in range(1, traj_len + 1):
+
+ # get next observation
+ if t == traj_len:
+ # play final action to get next observation for last timestep
+ next_obs, _, _, _ = env.step(actions[t - 1])
+ else:
+ # reset to simulator state to get observation
+ next_obs = env.reset_to({"states" : states[t]})
+
+ # infer reward signal
+ # note: our tasks use reward r(s'), reward AFTER transition, so this is
+ # the reward for the current timestep
+ r = env.get_reward()
+
+ # infer done signal
+ done = False
+ if (done_mode == 1) or (done_mode == 2):
+ # done = 1 at end of trajectory
+ done = done or (t == traj_len)
+ if (done_mode == 0) or (done_mode == 2):
+ # done = 1 when s' is task success state
+ done = done or env.is_success()["task"]
+ done = int(done)
+
+ # collect transition
+ traj["obs"].append(obs)
+ traj["next_obs"].append(next_obs)
+ traj["rewards"].append(r)
+ traj["dones"].append(done)
+
+ # update for next iter
+ obs = deepcopy(next_obs)
+
+ # convert list of dict to dict of list for obs dictionaries (for convenient writes to hdf5 dataset)
+ traj["obs"] = TensorUtils.list_of_flat_dict_to_dict_of_list(traj["obs"])
+ traj["next_obs"] = TensorUtils.list_of_flat_dict_to_dict_of_list(traj["next_obs"])
+
+ # list to numpy array
+ for k in traj:
+ if k == "initial_state_dict":
+ continue
+ if isinstance(traj[k], dict):
+ for kp in traj[k]:
+ traj[k][kp] = np.array(traj[k][kp])
+ else:
+ traj[k] = np.array(traj[k])
+
+ return traj, camera_info
+
+
+def get_camera_info(
+ env,
+ camera_names=None,
+ camera_height=84,
+ camera_width=84,
+):
+ """
+ Helper function to get camera intrinsics and extrinsics for cameras being used for observations.
+ """
+
+ # TODO: make this function more general than just robosuite environments
+ assert EnvUtils.is_robosuite_env(env=env)
+
+ if camera_names is None:
+ return None
+
+ camera_info = dict()
+ for cam_name in camera_names:
+ K = env.get_camera_intrinsic_matrix(camera_name=cam_name, camera_height=camera_height, camera_width=camera_width)
+ R = env.get_camera_extrinsic_matrix(camera_name=cam_name) # camera pose in world frame
+ if "eye_in_hand" in cam_name:
+ # convert extrinsic matrix to be relative to robot eef control frame
+ assert cam_name.startswith("robot0")
+ eef_site_name = env.base_env.robots[0].controller.eef_name
+ eef_pos = np.array(env.base_env.sim.data.site_xpos[env.base_env.sim.model.site_name2id(eef_site_name)])
+ eef_rot = np.array(env.base_env.sim.data.site_xmat[env.base_env.sim.model.site_name2id(eef_site_name)].reshape([3, 3]))
+ eef_pose = np.zeros((4, 4)) # eef pose in world frame
+ eef_pose[:3, :3] = eef_rot
+ eef_pose[:3, 3] = eef_pos
+ eef_pose[3, 3] = 1.0
+ eef_pose_inv = np.zeros((4, 4))
+ eef_pose_inv[:3, :3] = eef_pose[:3, :3].T
+ eef_pose_inv[:3, 3] = -eef_pose_inv[:3, :3].dot(eef_pose[:3, 3])
+ eef_pose_inv[3, 3] = 1.0
+ R = R.dot(eef_pose_inv) # T_E^W * T_W^C = T_E^C
+ camera_info[cam_name] = dict(
+ intrinsics=K.tolist(),
+ extrinsics=R.tolist(),
+ )
+ return camera_info
+
+
+def dataset_states_to_obs(args):
+ if args.depth:
+ assert len(args.camera_names) > 0, "must specify camera names if using depth"
+
+ # create environment to use for data processing
+ env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path=args.dataset)
+ env = EnvUtils.create_env_for_data_processing(
+ env_meta=env_meta,
+ camera_names=args.camera_names,
+ camera_height=args.camera_height,
+ camera_width=args.camera_width,
+ reward_shaping=args.shaped,
+ use_depth_obs=args.depth,
+ )
+
+ print("==== Using environment with the following metadata ====")
+ print(json.dumps(env.serialize(), indent=4))
+ print("")
+
+ # some operations for playback are robosuite-specific, so determine if this environment is a robosuite env
+ is_robosuite_env = EnvUtils.is_robosuite_env(env_meta)
+
+ # list of all demonstration episodes (sorted in increasing number order)
+ f = h5py.File(args.dataset, "r")
+ demos = list(f["data"].keys())
+ inds = np.argsort([int(elem[5:]) for elem in demos])
+ demos = [demos[i] for i in inds]
+
+ # maybe reduce the number of demonstrations to playback
+ if args.n is not None:
+ demos = demos[:args.n]
+
+ # output file in same directory as input file
+ output_path = os.path.join(os.path.dirname(args.dataset), args.output_name)
+ f_out = h5py.File(output_path, "w")
+ data_grp = f_out.create_group("data")
+ print("input file: {}".format(args.dataset))
+ print("output file: {}".format(output_path))
+
+ total_samples = 0
+ for ind in tqdm(range(len(demos))):
+ ep = demos[ind]
+
+ # prepare initial state to reload from
+ states = f["data/{}/states".format(ep)][()]
+ initial_state = dict(states=states[0])
+ if is_robosuite_env:
+ initial_state["model"] = f["data/{}".format(ep)].attrs["model_file"]
+
+ # extract obs, rewards, dones
+ actions = f["data/{}/actions".format(ep)][()]
+ traj, camera_info = extract_trajectory(
+ env=env,
+ initial_state=initial_state,
+ states=states,
+ actions=actions,
+ done_mode=args.done_mode,
+ camera_names=args.camera_names,
+ camera_height=args.camera_height,
+ camera_width=args.camera_width,
+ )
+
+ # maybe copy reward or done signal from source file
+ if args.copy_rewards:
+ traj["rewards"] = f["data/{}/rewards".format(ep)][()]
+ if args.copy_dones:
+ traj["dones"] = f["data/{}/dones".format(ep)][()]
+
+ # store transitions
+
+ # IMPORTANT: keep name of group the same as source file, to make sure that filter keys are
+ # consistent as well
+ ep_data_grp = data_grp.create_group(ep)
+ ep_data_grp.create_dataset("actions", data=np.array(traj["actions"]))
+ ep_data_grp.create_dataset("states", data=np.array(traj["states"]))
+ ep_data_grp.create_dataset("rewards", data=np.array(traj["rewards"]))
+ ep_data_grp.create_dataset("dones", data=np.array(traj["dones"]))
+ for k in traj["obs"]:
+ if args.compress:
+ ep_data_grp.create_dataset("obs/{}".format(k), data=np.array(traj["obs"][k]), compression="gzip")
+ else:
+ ep_data_grp.create_dataset("obs/{}".format(k), data=np.array(traj["obs"][k]))
+ if not args.exclude_next_obs:
+ if args.compress:
+ ep_data_grp.create_dataset("next_obs/{}".format(k), data=np.array(traj["next_obs"][k]), compression="gzip")
+ else:
+ ep_data_grp.create_dataset("next_obs/{}".format(k), data=np.array(traj["next_obs"][k]))
+
+ # episode metadata
+ if is_robosuite_env:
+ ep_data_grp.attrs["model_file"] = traj["initial_state_dict"]["model"] # model xml for this episode
+ ep_data_grp.attrs["num_samples"] = traj["actions"].shape[0] # number of transitions in this episode
+
+ if camera_info is not None:
+ assert is_robosuite_env
+ ep_data_grp.attrs["camera_info"] = json.dumps(camera_info, indent=4)
+
+ total_samples += traj["actions"].shape[0]
+
+
+ # copy over all filter keys that exist in the original hdf5
+ if "mask" in f:
+ f.copy("mask", f_out)
+
+ # global metadata
+ data_grp.attrs["total"] = total_samples
+ data_grp.attrs["env_args"] = json.dumps(env.serialize(), indent=4) # environment info
+ print("Wrote {} trajectories to {}".format(len(demos), output_path))
+
+ f.close()
+ f_out.close()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ required=True,
+ help="path to input hdf5 dataset",
+ )
+ # name of hdf5 to write - it will be in the same directory as @dataset
+ parser.add_argument(
+ "--output_name",
+ type=str,
+ required=True,
+ help="name of output hdf5 dataset",
+ )
+
+ # specify number of demos to process - useful for debugging conversion with a handful
+ # of trajectories
+ parser.add_argument(
+ "--n",
+ type=int,
+ default=None,
+ help="(optional) stop after n trajectories are processed",
+ )
+
+ # flag for reward shaping
+ parser.add_argument(
+ "--shaped",
+ action='store_true',
+ help="(optional) use shaped rewards",
+ )
+
+ # camera names to use for observations
+ parser.add_argument(
+ "--camera_names",
+ type=str,
+ nargs='+',
+ default=[],
+ help="(optional) camera name(s) to use for image observations. Leave out to not use image observations.",
+ )
+
+ parser.add_argument(
+ "--camera_height",
+ type=int,
+ default=84,
+ help="(optional) height of image observations",
+ )
+
+ parser.add_argument(
+ "--camera_width",
+ type=int,
+ default=84,
+ help="(optional) width of image observations",
+ )
+
+ # flag for including depth observations per camera
+ parser.add_argument(
+ "--depth",
+ action='store_true',
+ help="(optional) use depth observations for each camera",
+ )
+
+ # specifies how the "done" signal is written. If "0", then the "done" signal is 1 wherever
+ # the transition (s, a, s') has s' in a task completion state. If "1", the "done" signal
+ # is one at the end of every trajectory. If "2", the "done" signal is 1 at task completion
+ # states for successful trajectories and 1 at the end of all trajectories.
+ parser.add_argument(
+ "--done_mode",
+ type=int,
+ default=0,
+ help="how to write done signal. If 0, done is 1 whenever s' is a success state.\
+ If 1, done is 1 at the end of each trajectory. If 2, both.",
+ )
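+    # A sketch of the semantics described above: for a trajectory of length T with a
+    # per-step success flag success[t], the three modes roughly correspond to
+    #   done_mode 0: done[t] = int(success[t])
+    #   done_mode 1: done[t] = int(t == T - 1)
+    #   done_mode 2: done[t] = int(success[t] or t == T - 1)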
+
+ # flag for copying rewards from source file instead of re-writing them
+ parser.add_argument(
+ "--copy_rewards",
+ action='store_true',
+ help="(optional) copy rewards from source file instead of inferring them",
+ )
+
+ # flag for copying dones from source file instead of re-writing them
+ parser.add_argument(
+ "--copy_dones",
+ action='store_true',
+ help="(optional) copy dones from source file instead of inferring them",
+ )
+
+ # flag to exclude next obs in dataset
+ parser.add_argument(
+ "--exclude-next-obs",
+ action='store_true',
+ help="(optional) exclude next obs in dataset",
+ )
+
+ # flag to compress observations with gzip option in hdf5
+ parser.add_argument(
+ "--compress",
+ action='store_true',
+ help="(optional) compress observations with gzip option in hdf5",
+ )
+
+ args = parser.parse_args()
+ dataset_states_to_obs(args)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/download_datasets.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/download_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..caf3a280a14aec6f3c39157e9f9d84dd2a2486c4
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/download_datasets.py
@@ -0,0 +1,163 @@
+"""
+Script to download datasets packaged with the repository. By default, all
+datasets will be stored at robomimic/datasets, unless the @download_dir
+argument is supplied. We recommend using the default, as most examples that
+use these datasets assume that they can be found there.
+
+The @tasks, @dataset_types, and @hdf5_types arguments can all be supplied
+to choose which datasets to download.
+
+Args:
+ download_dir (str): Base download directory. Created if it doesn't exist.
+ Defaults to datasets folder in repository - only pass in if you would
+ like to override the location.
+
+ tasks (list): Tasks to download datasets for. Defaults to lift task. Pass 'all' to
+        download all tasks (sim + real), 'sim' to download all sim tasks, 'real' to
+ download all real tasks, or directly specify the list of tasks.
+
+ dataset_types (list): Dataset types to download datasets for (e.g. ph, mh, mg).
+ Defaults to ph. Pass 'all' to download datasets for all available dataset
+ types per task, or directly specify the list of dataset types.
+
+ hdf5_types (list): hdf5 types to download datasets for (e.g. raw, low_dim, image).
+ Defaults to low_dim. Pass 'all' to download datasets for all available hdf5
+ types per task and dataset, or directly specify the list of hdf5 types.
+
+Example usage:
+
+ # default behavior - just download lift proficient-human low-dim dataset
+ python download_datasets.py
+
+ # download low-dim proficient-human datasets for all simulation tasks
+ # (do a dry run first to see which datasets would be downloaded)
+ python download_datasets.py --tasks sim --dataset_types ph --hdf5_types low_dim --dry_run
+ python download_datasets.py --tasks sim --dataset_types ph --hdf5_types low_dim
+
+ # download all low-dim and image multi-human datasets for the can and square tasks
+ python download_datasets.py --tasks can square --dataset_types mh --hdf5_types low_dim image
+
+ # download the sparse reward machine-generated low-dim datasets
+ python download_datasets.py --tasks all --dataset_types mg --hdf5_types low_dim_sparse
+
+ # download all real robot datasets
+ python download_datasets.py --tasks real
+"""
+import os
+import argparse
+
+import robomimic
+import robomimic.utils.file_utils as FileUtils
+from robomimic import DATASET_REGISTRY
+
+ALL_TASKS = ["lift", "can", "square", "transport", "tool_hang", "lift_real", "can_real", "tool_hang_real"]
+ALL_DATASET_TYPES = ["ph", "mh", "mg", "paired"]
+ALL_HDF5_TYPES = ["raw", "low_dim", "image", "low_dim_sparse", "low_dim_dense", "image_sparse", "image_dense"]
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # directory to download datasets to
+ parser.add_argument(
+ "--download_dir",
+ type=str,
+ default=None,
+ help="Base download directory. Created if it doesn't exist. Defaults to datasets folder in repository.",
+ )
+
+ # tasks to download datasets for
+ parser.add_argument(
+ "--tasks",
+ type=str,
+ nargs='+',
+ default=["lift"],
+ help="Tasks to download datasets for. Defaults to lift task. Pass 'all' to download all tasks (sim + real)\
+ 'sim' to download all sim tasks, 'real' to download all real tasks, or directly specify the list of\
+ tasks.",
+ )
+
+ # dataset types to download datasets for
+ parser.add_argument(
+ "--dataset_types",
+ type=str,
+ nargs='+',
+ default=["ph"],
+ help="Dataset types to download datasets for (e.g. ph, mh, mg). Defaults to ph. Pass 'all' to download \
+ datasets for all available dataset types per task, or directly specify the list of dataset types.",
+ )
+
+ # hdf5 types to download datasets for
+ parser.add_argument(
+ "--hdf5_types",
+ type=str,
+ nargs='+',
+ default=["low_dim"],
+ help="hdf5 types to download datasets for (e.g. raw, low_dim, image). Defaults to raw. Pass 'all' \
+ to download datasets for all available hdf5 types per task and dataset, or directly specify the list\
+ of hdf5 types.",
+ )
+
+ # dry run - don't actually download datasets, but print which datasets would be downloaded
+ parser.add_argument(
+ "--dry_run",
+ action='store_true',
+ help="set this flag to do a dry run to only print which datasets would be downloaded"
+ )
+
+ args = parser.parse_args()
+
+ # set default base directory for downloads
+ default_base_dir = args.download_dir
+ if default_base_dir is None:
+ default_base_dir = os.path.join(robomimic.__path__[0], "../datasets")
+
+ # load args
+ download_tasks = args.tasks
+ if "all" in download_tasks:
+ assert len(download_tasks) == 1, "all should be only tasks argument but got: {}".format(args.tasks)
+ download_tasks = ALL_TASKS
+ elif "sim" in download_tasks:
+ assert len(download_tasks) == 1, "sim should be only tasks argument but got: {}".format(args.tasks)
+ download_tasks = [task for task in ALL_TASKS if "real" not in task]
+ elif "real" in download_tasks:
+ assert len(download_tasks) == 1, "real should be only tasks argument but got: {}".format(args.tasks)
+ download_tasks = [task for task in ALL_TASKS if "real" in task]
+
+ download_dataset_types = args.dataset_types
+ if "all" in download_dataset_types:
+ assert len(download_dataset_types) == 1, "all should be only dataset_types argument but got: {}".format(args.dataset_types)
+ download_dataset_types = ALL_DATASET_TYPES
+
+ download_hdf5_types = args.hdf5_types
+ if "all" in download_hdf5_types:
+ assert len(download_hdf5_types) == 1, "all should be only hdf5_types argument but got: {}".format(args.hdf5_types)
+ download_hdf5_types = ALL_HDF5_TYPES
+
+ # download requested datasets
+ for task in DATASET_REGISTRY:
+ if task in download_tasks:
+ for dataset_type in DATASET_REGISTRY[task]:
+ if dataset_type in download_dataset_types:
+ for hdf5_type in DATASET_REGISTRY[task][dataset_type]:
+ if hdf5_type in download_hdf5_types:
+ download_dir = os.path.abspath(os.path.join(default_base_dir, task, dataset_type))
+ print("\nDownloading dataset:\n task: {}\n dataset type: {}\n hdf5 type: {}\n download path: {}"
+ .format(task, dataset_type, hdf5_type, download_dir))
+ url = DATASET_REGISTRY[task][dataset_type][hdf5_type]["url"]
+ if url is None:
+ print(
+ "Skipping {}-{}-{}, no url for dataset exists.".format(task, dataset_type, hdf5_type)
+ + " Create this dataset locally by running the appropriate command from robomimic/scripts/extract_obs_from_raw_datasets.sh."
+ )
+ continue
+ if args.dry_run:
+ print("\ndry run: skip download")
+ else:
+ # Make sure path exists and create if it doesn't
+ os.makedirs(download_dir, exist_ok=True)
+ FileUtils.download_url(
+ url=DATASET_REGISTRY[task][dataset_type][hdf5_type]["url"],
+ download_dir=download_dir,
+ )
+ print("")
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/download_momart_datasets.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/download_momart_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..affecf11b525f39aaae47095bb85c6086a955a70
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/download_momart_datasets.py
@@ -0,0 +1,161 @@
+"""
+Script to download datasets used in MoMaRT paper (https://arxiv.org/abs/2112.05251). By default, all
+datasets will be stored at robomimic/datasets, unless the @download_dir
+argument is supplied. We recommend using the default, as most examples that
+use these datasets assume that they can be found there.
+
+The @tasks and @dataset_types arguments can both be supplied
+to choose which datasets to download.
+
+Args:
+ download_dir (str): Base download directory. Created if it doesn't exist.
+ Defaults to datasets folder in repository - only pass in if you would
+ like to override the location.
+
+ tasks (list): Tasks to download datasets for. Defaults to table_setup_from_dishwasher task. Pass 'all' to
+ download all tasks - 5 total:
+ - table_setup_from_dishwasher
+ - table_setup_from_dresser
+ - table_cleanup_to_dishwasher
+ - table_cleanup_to_sink
+ - unload_dishwasher
+
+ dataset_types (list): Dataset types to download datasets for (expert, suboptimal, generalize, sample).
+ Defaults to expert. Pass 'all' to download datasets for all available dataset
+ types per task, or directly specify the list of dataset types.
+        NOTE: Because these datasets are huge, the script always prints a warning that the
+        user must acknowledge (by responding yes) before downloading, since the total size
+        can exceed 100GB for all tasks of a single dataset type.
+
+Example usage:
+
+ # default behavior - just download expert table_setup_from_dishwasher dataset
+ python download_momart_datasets.py
+
+ # download expert datasets for all tasks
+ # (do a dry run first to see which datasets would be downloaded)
+ python download_momart_datasets.py --tasks all --dataset_types expert --dry_run
+    python download_momart_datasets.py --tasks all --dataset_types expert
+
+    # download all expert and suboptimal datasets for the table_setup_from_dishwasher and table_cleanup_to_dishwasher tasks
+    python download_momart_datasets.py --tasks table_setup_from_dishwasher table_cleanup_to_dishwasher --dataset_types expert suboptimal
+
+    # download the sample datasets
+    python download_momart_datasets.py --tasks all --dataset_types sample
+
+    # download all datasets
+    python download_momart_datasets.py --tasks all --dataset_types all
+"""
+import os
+import argparse
+
+import robomimic
+import robomimic.utils.file_utils as FileUtils
+from robomimic import MOMART_DATASET_REGISTRY
+
+ALL_TASKS = [
+ "table_setup_from_dishwasher",
+ "table_setup_from_dresser",
+ "table_cleanup_to_dishwasher",
+ "table_cleanup_to_sink",
+ "unload_dishwasher",
+]
+ALL_DATASET_TYPES = [
+ "expert",
+ "suboptimal",
+ "generalize",
+ "sample",
+]
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # directory to download datasets to
+ parser.add_argument(
+ "--download_dir",
+ type=str,
+ default=None,
+ help="Base download directory. Created if it doesn't exist. Defaults to datasets folder in repository.",
+ )
+
+ # tasks to download datasets for
+ parser.add_argument(
+ "--tasks",
+ type=str,
+ nargs='+',
+ default=["table_setup_from_dishwasher"],
+ help="Tasks to download datasets for. Defaults to table_setup_from_dishwasher task. Pass 'all' to download all"
+ f"5 tasks, or directly specify the list of tasks. Options are any of: {ALL_TASKS}",
+ )
+
+ # dataset types to download datasets for
+ parser.add_argument(
+ "--dataset_types",
+ type=str,
+ nargs='+',
+ default=["expert"],
+ help="Dataset types to download datasets for (e.g. expert, suboptimal). Defaults to expert. Pass 'all' to "
+ "download datasets for all available dataset types per task, or directly specify the list of dataset "
+ f"types. Options are any of: {ALL_DATASET_TYPES}",
+ )
+
+ # dry run - don't actually download datasets, but print which datasets would be downloaded
+ parser.add_argument(
+ "--dry_run",
+ action='store_true',
+ help="set this flag to do a dry run to only print which datasets would be downloaded"
+ )
+
+ args = parser.parse_args()
+
+ # set default base directory for downloads
+ default_base_dir = args.download_dir
+ if default_base_dir is None:
+ default_base_dir = os.path.join(robomimic.__path__[0], "../datasets")
+
+ # load args
+ download_tasks = args.tasks
+ if "all" in download_tasks:
+ assert len(download_tasks) == 1, "all should be only tasks argument but got: {}".format(args.tasks)
+ download_tasks = ALL_TASKS
+
+ download_dataset_types = args.dataset_types
+ if "all" in download_dataset_types:
+ assert len(download_dataset_types) == 1, "all should be only dataset_types argument but got: {}".format(args.dataset_types)
+ download_dataset_types = ALL_DATASET_TYPES
+
+ # Run sanity check first to warn user if they're about to download a huge amount of data
+ total_size = 0
+ for task in MOMART_DATASET_REGISTRY:
+ if task in download_tasks:
+ for dataset_type in MOMART_DATASET_REGISTRY[task]:
+ if dataset_type in download_dataset_types:
+ total_size += MOMART_DATASET_REGISTRY[task][dataset_type]["size"]
+
+ # Verify user acknowledgement if we're not doing a dry run
+ if not args.dry_run:
+ user_response = input(f"Warning: requested datasets will take a total of {total_size}GB. Proceed? y/n\n")
+        assert user_response.lower() in {"yes", "y"}, "Did not receive confirmation. Aborting download."
+
+ # download requested datasets
+ for task in MOMART_DATASET_REGISTRY:
+ if task in download_tasks:
+ for dataset_type in MOMART_DATASET_REGISTRY[task]:
+ if dataset_type in download_dataset_types:
+ dataset_info = MOMART_DATASET_REGISTRY[task][dataset_type]
+ download_dir = os.path.abspath(os.path.join(default_base_dir, task, dataset_type))
+ print(f"\nDownloading dataset:\n"
+ f" task: {task}\n"
+ f" dataset type: {dataset_type}\n"
+ f" dataset size: {dataset_info['size']}GB\n"
+ f" download path: {download_dir}")
+ if args.dry_run:
+ print("\ndry run: skip download")
+ else:
+ # Make sure path exists and create if it doesn't
+ os.makedirs(download_dir, exist_ok=True)
+ FileUtils.download_url(
+ url=dataset_info["url"],
+ download_dir=download_dir,
+ )
+ print("")
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/extract_obs_from_raw_datasets.sh b/phantom/submodules/phantom-robomimic/robomimic/scripts/extract_obs_from_raw_datasets.sh
new file mode 100644
index 0000000000000000000000000000000000000000..00fc78f8bf08df5339e79c65019db683dfac6e59
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/extract_obs_from_raw_datasets.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+
+# This script holds the commands that were used to go from raw robosuite demo.hdf5 files
+# to our processed low-dim and image hdf5 files.
+
+BASE_DATASET_DIR="../../datasets"
+echo "Using base dataset directory: $BASE_DATASET_DIR"
+
+
+### NOTE: we use done-mode 0 for MG (dones on task success) ###
+
+
+### mg ###
+
+
+# lift - mg, sparse
+python dataset_states_to_obs.py --done_mode 0 \
+--dataset $BASE_DATASET_DIR/lift/mg/demo_v141.hdf5 \
+--output_name low_dim_sparse_v141.hdf5
+python dataset_states_to_obs.py --done_mode 0 \
+--dataset $BASE_DATASET_DIR/lift/mg/demo_v141.hdf5 \
+--output_name image_sparse_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# lift - mg, dense
+python dataset_states_to_obs.py --done_mode 0 --shaped \
+--dataset $BASE_DATASET_DIR/lift/mg/demo_v141.hdf5 \
+--output_name low_dim_dense_v141.hdf5
+python dataset_states_to_obs.py --done_mode 0 --shaped \
+--dataset $BASE_DATASET_DIR/lift/mg/demo_v141.hdf5 \
+--output_name image_dense_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# can - mg, sparse
+python dataset_states_to_obs.py --done_mode 0 \
+--dataset $BASE_DATASET_DIR/can/mg/demo_v141.hdf5 \
+--output_name low_dim_sparse_v141.hdf5
+python dataset_states_to_obs.py --done_mode 0 \
+--dataset $BASE_DATASET_DIR/can/mg/demo_v141.hdf5 \
+--output_name image_sparse_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# can - mg, dense
+python dataset_states_to_obs.py --done_mode 0 --shaped \
+--dataset $BASE_DATASET_DIR/can/mg/demo_v141.hdf5 \
+--output_name low_dim_dense_v141.hdf5
+python dataset_states_to_obs.py --done_mode 0 --shaped \
+--dataset $BASE_DATASET_DIR/can/mg/demo_v141.hdf5 \
+--output_name image_dense_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+
+### NOTE: we use done-mode 2 for PH / MH (dones on task success and end of trajectory) ###
+
+
+### ph ###
+
+
+# lift - ph
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/lift/ph/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/lift/ph/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# can - ph
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/can/ph/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/can/ph/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# square - ph
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/square/ph/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/square/ph/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# transport - ph
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/transport/ph/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/transport/ph/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names shouldercamera0 shouldercamera1 robot0_eye_in_hand robot1_eye_in_hand --camera_height 84 --camera_width 84
+
+# tool hang - ph
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/tool_hang/ph/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/tool_hang/ph/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names sideview robot0_eye_in_hand --camera_height 240 --camera_width 240
+
+
+### mh ###
+
+
+# lift - mh
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/lift/mh/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/lift/mh/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# can - mh
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/can/mh/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/can/mh/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# square - mh
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/square/mh/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/square/mh/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
+
+# transport - mh
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/transport/mh/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/transport/mh/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names shouldercamera0 shouldercamera1 robot0_eye_in_hand robot1_eye_in_hand --camera_height 84 --camera_width 84
+
+
+### can-paired ###
+
+
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/can/paired/demo_v141.hdf5 \
+--output_name low_dim_v141.hdf5
+python dataset_states_to_obs.py --done_mode 2 \
+--dataset $BASE_DATASET_DIR/can/paired/demo_v141.hdf5 \
+--output_name image_v141.hdf5 --camera_names agentview robot0_eye_in_hand --camera_height 84 --camera_width 84
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/generate_config_templates.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/generate_config_templates.py
new file mode 100644
index 0000000000000000000000000000000000000000..56e1d8710c124cd418850bf25e016873ed88c49d
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/generate_config_templates.py
@@ -0,0 +1,28 @@
+"""
+Helpful script to generate example config files for each algorithm. These should be re-generated
+when new config options are added, or when default settings in the config classes are modified.
+"""
+import os
+import json
+
+import robomimic
+from robomimic.config import get_all_registered_configs
+
+
+def main():
+ # store template config jsons in this directory
+ target_dir = os.path.join(robomimic.__path__[0], "exps/templates/")
+
+ # iterate through registered algorithm config classes
+ all_configs = get_all_registered_configs()
+ for algo_name in all_configs:
+ # make config class for this algorithm
+ c = all_configs[algo_name]()
+ assert algo_name == c.algo_name
+ # dump to json
+ json_path = os.path.join(target_dir, "{}.json".format(algo_name))
+ c.dump(filename=json_path)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/generate_paper_configs.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/generate_paper_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ed7d5b15a25def7da7a02c7c0e135772f269a0
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/generate_paper_configs.py
@@ -0,0 +1,1369 @@
+"""
+Helper script to generate jsons for reproducing paper experiments.
+
+Args:
+ config_dir (str): Directory where generated configs will be placed.
+ Defaults to 'paper' subfolder in exps folder of repository
+
+ dataset_dir (str): Base dataset directory where released datasets can be
+ found on disk. Defaults to datasets folder in repository.
+
+ output_dir (str): Base output directory for all training runs that will be
+ written to generated configs.
+
+Example usage:
+    # Assume datasets already exist in robomimic/../datasets folder. Configs will be generated under robomimic/exps/paper
+ python generate_paper_configs.py --output_dir /tmp/experiment_results
+
+ # Specify where datasets exist, and specify where configs should be generated.
+ python generate_paper_configs.py --config_dir /tmp/configs --dataset_dir /tmp/datasets --output_dir /tmp/experiment_results
+"""
+import os
+import argparse
+import robomimic
+from robomimic import DATASET_REGISTRY
+from robomimic.config import Config, BCConfig, BCQConfig, CQLConfig, HBCConfig, IRISConfig, config_factory
+
+
+def modify_config_for_default_low_dim_exp(config):
+ """
+ Modifies a Config object with experiment, training, and observation settings that
+ were used across all low-dimensional experiments by default.
+
+ Args:
+ config (Config instance): config to modify
+ """
+
+ with config.experiment.values_unlocked():
+ # save model during every evaluation (every 50 epochs)
+ config.experiment.save.enabled = True
+ config.experiment.save.every_n_epochs = 50
+
+ # every epoch is 100 gradient steps, and validation epoch is 10 gradient steps
+ config.experiment.epoch_every_n_steps = 100
+ config.experiment.validation_epoch_every_n_steps = 10
+
+ # do 50 evaluation rollouts every 50 epochs
+ # NOTE: horizon will generally get set depending on the task and dataset type
+ config.experiment.rollout.enabled = True
+ config.experiment.rollout.n = 50
+ config.experiment.rollout.horizon = 400
+ config.experiment.rollout.rate = 50
+ config.experiment.rollout.warmstart = 0
+ config.experiment.rollout.terminate_on_success = True
+
+ with config.train.values_unlocked():
+ # assume entire dataset can fit in memory
+ config.train.num_data_workers = 0
+ config.train.hdf5_cache_mode = "all"
+
+ # batch size 100 and 2000 training epochs
+ config.train.batch_size = 100
+ config.train.num_epochs = 2000
+
+ with config.observation.values_unlocked():
+ # default observation is eef pose, gripper finger position, and object information,
+ # all of which are low-dim.
+ default_low_dim_obs = [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "object",
+ ]
+ # handle hierarchical observation configs
+ if config.algo_name == "hbc":
+ configs_to_set = [
+ config.observation.actor.modalities.obs,
+ config.observation.planner.modalities.obs,
+ config.observation.planner.modalities.subgoal,
+ ]
+ elif config.algo_name == "iris":
+ configs_to_set = [
+ config.observation.actor.modalities.obs,
+ config.observation.value_planner.planner.modalities.obs,
+ config.observation.value_planner.planner.modalities.subgoal,
+ config.observation.value_planner.value.modalities.obs,
+ ]
+ else:
+ configs_to_set = [config.observation.modalities.obs]
+ # set all observations / subgoals to use the correct low-dim modalities
+ for cfg in configs_to_set:
+ cfg.low_dim = list(default_low_dim_obs)
+ cfg.rgb = []
+
+ return config
+
+
+def modify_config_for_default_image_exp(config):
+ """
+ Modifies a Config object with experiment, training, and observation settings that
+ were used across all image experiments by default.
+
+ Args:
+ config (Config instance): config to modify
+ """
+ assert config.algo_name not in ["hbc", "iris"], "no image training for HBC and IRIS"
+
+ with config.experiment.values_unlocked():
+ # save model during every evaluation (every 20 epochs)
+ config.experiment.save.enabled = True
+ config.experiment.save.every_n_epochs = 20
+
+ # every epoch is 500 gradient steps, and validation epoch is 50 gradient steps
+ config.experiment.epoch_every_n_steps = 500
+ config.experiment.validation_epoch_every_n_steps = 50
+
+ # do 50 evaluation rollouts every 20 epochs
+ # NOTE: horizon will generally get set depending on the task and dataset type
+ config.experiment.rollout.enabled = True
+ config.experiment.rollout.n = 50
+ config.experiment.rollout.horizon = 400
+ config.experiment.rollout.rate = 20
+ config.experiment.rollout.warmstart = 0
+ config.experiment.rollout.terminate_on_success = True
+
+ with config.train.values_unlocked():
+ # only cache low-dim info, and use 2 data workers to increase fetch speed for image obs
+ config.train.num_data_workers = 2
+ config.train.hdf5_cache_mode = "low_dim"
+
+ # batch size 16 and 600 training epochs
+ config.train.batch_size = 16
+ config.train.num_epochs = 600
+
+
+ with config.observation.values_unlocked():
+ # default low-dim observation is eef pose, gripper finger position
+ # default image observation is external camera and wrist camera
+ config.observation.modalities.obs.low_dim = [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ ]
+ config.observation.modalities.obs.rgb = [
+ "agentview_image",
+ "robot0_eye_in_hand_image",
+ ]
+ config.observation.modalities.goal.low_dim = []
+ config.observation.modalities.goal.rgb = []
+
+ # default image encoder architecture is ResNet with spatial softmax
+ config.observation.encoder.rgb.core_class = "VisualCore"
+ config.observation.encoder.rgb.core_kwargs.feature_dimension = 64
+ config.observation.encoder.rgb.core_kwargs.backbone_class = 'ResNet18Conv' # ResNet backbone for image observations (unused if no image observations)
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.pretrained = False # kwargs for visual core
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.input_coord_conv = False
+ config.observation.encoder.rgb.core_kwargs.pool_class = "SpatialSoftmax" # Alternate options are "SpatialMeanPool" or None (no pooling)
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.num_kp = 32 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.learnable_temperature = False # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.temperature = 1.0 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.noise_std = 0.0
+
+ # observation randomizer class - set to None to use no randomization, or 'CropRandomizer' to use crop randomization
+ config.observation.encoder.rgb.obs_randomizer_class = "CropRandomizer"
+
+ # kwargs for observation randomizers (for the CropRandomizer, this is size and number of crops)
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_height = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_width = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.num_crops = 1
+ config.observation.encoder.rgb.obs_randomizer_kwargs.pos_enc = False
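+        # (crop sizes throughout are roughly 90% of the raw image resolution:
+        # 76 for the default 84x84 images, and e.g. 108/120 and 216/240 further below)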
+
+ return config
+
+
+def modify_config_for_dataset(config, task_name, dataset_type, hdf5_type, base_dataset_dir, filter_key=None):
+ """
+ Modifies a Config object with experiment, training, and observation settings to
+ correspond to experiment settings for the dataset collected on @task_name with
+ dataset source @dataset_type (e.g. ph, mh, mg), and hdf5 type @hdf5_type (e.g. low_dim
+ or image).
+
+ Args:
+ config (Config instance): config to modify
+
+ task_name (str): identify task that dataset was collected on
+
+ dataset_type (str): dataset type for this dataset (e.g. ph, mh, mg).
+
+ hdf5_type (str): hdf5 type for this dataset (e.g. raw, low_dim, image).
+
+ base_dataset_dir (str): path to directory where datasets are on disk.
+ Directory structure is expected to be consistent with the output
+ of @make_dataset_dirs in the download_datasets.py script.
+
+ filter_key (str): if not None, use the provided filter key to select a subset of the
+ provided dataset
+ """
+ assert task_name in DATASET_REGISTRY, \
+ "task {} not found in dataset registry!".format(task_name)
+ assert dataset_type in DATASET_REGISTRY[task_name], \
+ "dataset type {} not found for task {} in dataset registry!".format(dataset_type, task_name)
+ assert hdf5_type in DATASET_REGISTRY[task_name][dataset_type], \
+ "hdf5 type {} not found for dataset type {} and task {} in dataset registry!".format(hdf5_type, dataset_type, task_name)
+
+ is_real_dataset = "real" in task_name
+ if is_real_dataset:
+ assert config.algo_name == "bc", "we only ran BC-RNN on real robot"
+ else:
+ assert hdf5_type != "raw", "cannot train on raw demonstrations"
+
+ with config.experiment.values_unlocked():
+
+ # look up rollout evaluation horizon in registry and set it
+ config.experiment.rollout.horizon = DATASET_REGISTRY[task_name][dataset_type][hdf5_type]["horizon"]
+
+ if dataset_type == "mg":
+ # machine-generated datasets did not use validation
+ config.experiment.validate = False
+ else:
+ # all other datasets used validation
+ config.experiment.validate = True
+
+ if is_real_dataset:
+ # no evaluation rollouts for real robot training
+ config.experiment.rollout.enabled = False
+
+ with config.train.values_unlocked():
+ # set dataset path and possibly filter keys
+ url = DATASET_REGISTRY[task_name][dataset_type][hdf5_type]["url"]
+ if url is None:
+ # infer file_name
+ if task_name in ["lift", "can", "square", "tool_hang", "transport"]:
+ file_name = "{}_v141.hdf5".format(hdf5_type)
+ elif task_name in ["lift_real", "can_real", "tool_hang_real"]:
+ file_name = "{}.hdf5".format(hdf5_type)
+ else:
+ raise ValueError("Unknown dataset type")
+ else:
+ file_name = url.split("/")[-1]
+ config.train.data = os.path.join(base_dataset_dir, task_name, dataset_type, file_name)
+ config.train.hdf5_filter_key = None if filter_key is None else filter_key
+ config.train.hdf5_validation_filter_key = None
+ if config.experiment.validate:
+ # set train and valid keys for validation
+ config.train.hdf5_filter_key = "train" if filter_key is None else "{}_train".format(filter_key)
+ config.train.hdf5_validation_filter_key = "valid" if filter_key is None else "{}_valid".format(filter_key)
+
+ with config.observation.values_unlocked():
+ # maybe modify observation names and randomization sizes (since image size might be different)
+
+ if is_real_dataset:
+ # modify observation names for real robot datasets
+ config.observation.modalities.obs.low_dim = [
+ "ee_pose",
+ "gripper_position",
+ ]
+
+ if task_name == "tool_hang_real":
+ # side and wrist camera
+ config.observation.modalities.obs.rgb = [
+ "image_side",
+ "image_wrist",
+ ]
+ # 240x240 images -> crops should be 216x216
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_height = 216
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_width = 216
+ else:
+ # front and wrist camera
+ config.observation.modalities.obs.rgb = [
+ "image",
+ "image_wrist",
+ ]
+ # 120x120 images -> crops should be 108x108
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_height = 108
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_width = 108
+
+ elif hdf5_type in ["image", "image_sparse", "image_dense"]:
+ if task_name == "transport":
+ # robot proprioception per arm
+ config.observation.modalities.obs.low_dim = [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "robot1_eef_pos",
+ "robot1_eef_quat",
+ "robot1_gripper_qpos",
+ ]
+
+ # shoulder and wrist cameras per arm
+ config.observation.modalities.obs.rgb = [
+ "shouldercamera0_image",
+ "robot0_eye_in_hand_image",
+ "shouldercamera1_image",
+ "robot1_eye_in_hand_image",
+ ]
+ elif task_name == "tool_hang":
+ # side and wrist camera
+ config.observation.modalities.obs.rgb = [
+ "sideview_image",
+ "robot0_eye_in_hand_image",
+ ]
+ # 240x240 images -> crops should be 216x216
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_height = 216
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_width = 216
+
+ elif hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ if task_name == "transport":
+ # robot proprioception per arm
+ default_low_dim_obs = [
+ "robot0_eef_pos",
+ "robot0_eef_quat",
+ "robot0_gripper_qpos",
+ "robot1_eef_pos",
+ "robot1_eef_quat",
+ "robot1_gripper_qpos",
+ "object",
+ ]
+ # handle hierarchical observation configs
+ if config.algo_name == "hbc":
+ configs_to_set = [
+ config.observation.actor.modalities.obs,
+ config.observation.planner.modalities.obs,
+ config.observation.planner.modalities.subgoal,
+ ]
+ elif config.algo_name == "iris":
+ configs_to_set = [
+ config.observation.actor.modalities.obs,
+ config.observation.value_planner.planner.modalities.obs,
+ config.observation.value_planner.planner.modalities.subgoal,
+ config.observation.value_planner.value.modalities.obs,
+ ]
+ else:
+ configs_to_set = [config.observation.modalities.obs]
+ # set all observations / subgoals to use the correct low-dim modalities
+ for obs_key_config in configs_to_set:
+ obs_key_config.low_dim = list(default_low_dim_obs)
+ obs_key_config.rgb = []
+
+ return config
+
+
+def modify_bc_config_for_dataset(config, task_name, dataset_type, hdf5_type):
+ """
+ Modifies a BCConfig object for training on a particular kind of dataset. This function
+ just sets algorithm hyperparameters in the algo config depending on the kind of
+ dataset.
+
+ Args:
+ config (BCConfig instance): config to modify
+
+ task_name (str): identify task that dataset was collected on. Only used to distinguish
+ between simulation and real-world, for an assert statement
+
+ dataset_type (str): dataset type for this dataset (e.g. ph, mh, mg, paired).
+
+ hdf5_type (str): hdf5 type for this dataset (e.g. raw, low_dim, image).
+ """
+ assert isinstance(config, BCConfig), "must be BCConfig"
+ assert config.algo_name == "bc", "must be BCConfig"
+ assert dataset_type in ["ph", "mh", "mg", "paired"], "invalid dataset type"
+ is_real_dataset = "real" in task_name
+ if not is_real_dataset:
+ assert hdf5_type != "raw", "cannot train on raw demonstrations"
+
+ with config.algo.values_unlocked():
+ # base parameters that may get modified
+ config.algo.optim_params.policy.learning_rate.initial = 1e-4 # learning rate 1e-4
+ config.algo.actor_layer_dims = (1024, 1024) # MLP size (1024, 1024)
+ config.algo.gmm.enabled = True # enable GMM
+
+ if dataset_type == "mg":
+ # machine-generated datasets don't use GMM
+ config.algo.gmm.enabled = False # disable GMM
+ if hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ # low-dim mg uses LR 1e-3
+ config.algo.optim_params.policy.learning_rate.initial = 1e-3 # learning rate 1e-3
+
+ return config
+
+
+def modify_bc_rnn_config_for_dataset(config, task_name, dataset_type, hdf5_type):
+ """
+ Modifies a BCConfig object for training on a particular kind of dataset. This function
+ just sets algorithm hyperparameters in the algo config depending on the kind of
+ dataset.
+
+ Args:
+ config (BCConfig instance): config to modify
+
+ task_name (str): identify task that dataset was collected on. Only used to distinguish
+ between simulation and real-world, for an assert statement
+
+ dataset_type (str): dataset type for this dataset (e.g. ph, mh, mg, paired).
+
+ hdf5_type (str): hdf5 type for this dataset (e.g. raw, low_dim, image).
+ """
+ assert isinstance(config, BCConfig), "must be BCConfig"
+ assert config.algo_name == "bc", "must be BCConfig"
+ assert dataset_type in ["ph", "mh", "mg", "paired"], "invalid dataset type"
+ is_real_dataset = "real" in task_name
+ if not is_real_dataset:
+ assert hdf5_type != "raw", "cannot train on raw demonstrations"
+
+ with config.train.values_unlocked():
+        # train on sequences of length 10 (matches the RNN horizon set below)
+ config.train.seq_length = 10
+
+ with config.algo.values_unlocked():
+ # make sure RNN is enabled with sequence length 10
+ config.algo.rnn.enabled = True
+ config.algo.rnn.horizon = 10
+
+ # base parameters that may get modified
+ config.algo.optim_params.policy.learning_rate.initial = 1e-4 # learning rate 1e-4
+ config.algo.actor_layer_dims = () # no MLP layers between rnn layer and output
+ config.algo.gmm.enabled = True # enable GMM
+ config.algo.rnn.hidden_dim = 400 # rnn dim 400
+
+ if dataset_type == "mg":
+ # update hyperparams for machine-generated datasets
+ config.algo.gmm.enabled = False # disable GMM
+ if hdf5_type not in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ # image datasets use RNN dim 1000
+ config.algo.rnn.hidden_dim = 1000 # rnn dim 1000
+ else:
+ # update hyperparams for all other dataset types (ph, mh, paired)
+ if hdf5_type not in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ # image datasets use RNN dim 1000
+ config.algo.rnn.hidden_dim = 1000 # rnn dim 1000
+
+ return config
+
+
+def modify_bcq_config_for_dataset(config, task_name, dataset_type, hdf5_type):
+ """
+ Modifies a BCQConfig object for training on a particular kind of dataset. This function
+ just sets algorithm hyperparameters in the algo config depending on the kind of
+ dataset.
+
+ Args:
+ config (BCQConfig instance): config to modify
+
+ task_name (str): identify task that dataset was collected on. Only used to distinguish
+ between simulation and real-world, for an assert statement
+
+ dataset_type (str): dataset type for this dataset (e.g. ph, mh, mg, paired).
+
+ hdf5_type (str): hdf5 type for this dataset (e.g. raw, low_dim, image).
+ """
+ assert isinstance(config, BCQConfig), "must be BCQConfig"
+ assert config.algo_name == "bcq", "must be BCQConfig"
+ assert dataset_type in ["ph", "mh", "mg", "paired"], "invalid dataset type"
+ is_real_dataset = "real" in task_name
+ assert not is_real_dataset, "we only ran BC-RNN on real robot"
+ if not is_real_dataset:
+ assert hdf5_type != "raw", "cannot train on raw demonstrations"
+
+ with config.algo.values_unlocked():
+ # base parameters that may get modified further
+        config.algo.optim_params.critic.learning_rate.initial = 1e-4           # critic / action sampler learning rates 1e-4 (actor 1e-3 below)
+ config.algo.optim_params.action_sampler.learning_rate.initial = 1e-4
+ config.algo.optim_params.actor.learning_rate.initial = 1e-3
+ config.algo.actor.enabled = False # disable actor by default
+ config.algo.action_sampler.vae.enabled = True # use VAE action sampler
+ config.algo.action_sampler.gmm.enabled = False
+ config.algo.action_sampler.vae.kl_weight = 0.05 # beta 0.05 for VAE
+ config.algo.action_sampler.vae.latent_dim = 14 # latent dim 14
+ config.algo.action_sampler.vae.prior.learn = False # N(0, 1) prior
+ config.algo.critic.layer_dims = (300, 400) # all MLP sizes at (300, 400)
+ config.algo.action_sampler.vae.encoder_layer_dims = (300, 400)
+ config.algo.action_sampler.vae.decoder_layer_dims = (300, 400)
+ config.algo.actor.layer_dims = (300, 400)
+ config.algo.target_tau = 5e-4 # tau 5e-4
+ config.algo.discount = 0.99 # discount 0.99
+ config.algo.critic.num_action_samples = 10 # number of action sampler samples at train and test
+ config.algo.critic.num_action_samples_rollout = 100
+
+ if dataset_type == "mg":
+ # update hyperparams for machine-generated datasets
+ config.algo.optim_params.critic.learning_rate.initial = 1e-3 # all learning rates 1e-3
+ config.algo.optim_params.action_sampler.learning_rate.initial = 1e-3
+ config.algo.optim_params.actor.learning_rate.initial = 1e-3
+ config.algo.action_sampler.vae.kl_weight = 0.5 # beta 0.5 for VAE
+ config.algo.target_tau = 5e-3 # tau 5e-3
+
+ if hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ # enable actor only on low-dim
+ config.algo.actor.enabled = True
+ else:
+ # make some modifications where needed for human datasets
+ if hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ if dataset_type in ["mh", "paired"]:
+ # low-dim, MH had higher layer sizes
+ config.algo.critic.layer_dims = (1024, 1024)
+ config.algo.action_sampler.vae.encoder_layer_dims = (1024, 1024)
+ config.algo.action_sampler.vae.decoder_layer_dims = (1024, 1024)
+ config.algo.action_sampler.vae.prior_layer_dims = (1024, 1024)
+
+ config.algo.action_sampler.vae.kl_weight = 0.5
+
+ # use learned GMM prior for MH dataset
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = True
+ config.algo.action_sampler.vae.prior.use_gmm = True
+ config.algo.action_sampler.vae.prior.gmm_learn_weights = True
+ else:
+ if dataset_type == "ph":
+ # image, PH used higher critic LR of 1e-3
+ config.algo.optim_params.critic.learning_rate.initial = 1e-3
+ # image datasets used bigger VAE
+ config.algo.action_sampler.vae.encoder_layer_dims = (1024, 1024)
+ config.algo.action_sampler.vae.decoder_layer_dims = (1024, 1024)
+ if dataset_type in ["mh", "paired"]:
+ # image, MH also had bigger critic
+ config.algo.critic.layer_dims = (1024, 1024)
+
+ return config
+
+
+def modify_cql_config_for_dataset(config, task_name, dataset_type, hdf5_type):
+ """
+ Modifies a CQLConfig object for training on a particular kind of dataset. This function
+ just sets algorithm hyperparameters in the algo config depending on the kind of
+ dataset.
+
+ Args:
+ config (CQLConfig instance): config to modify
+
+ task_name (str): identify task that dataset was collected on. Only used to distinguish
+ between simulation and real-world, for an assert statement
+
+ dataset_type (str): dataset type for this dataset (e.g. ph, mh, mg, paired).
+
+ hdf5_type (str): hdf5 type for this dataset (e.g. raw, low_dim, image).
+ """
+ assert isinstance(config, CQLConfig), "must be CQLConfig"
+ assert config.algo_name == "cql", "must be CQLConfig"
+ assert dataset_type in ["ph", "mh", "mg", "paired"], "invalid dataset type"
+ is_real_dataset = "real" in task_name
+ assert not is_real_dataset, "we only ran BC-RNN on real robot"
+ if not is_real_dataset:
+ assert hdf5_type != "raw", "cannot train on raw demonstrations"
+
+ with config.train.values_unlocked():
+ # CQL uses batch size 1024 (for low-dim) and 8 (for image)
+ if hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ config.train.batch_size = 1024
+ else:
+ config.train.batch_size = 8
+
+ with config.algo.values_unlocked():
+ # base parameters that may get modified further
+ config.algo.optim_params.critic.learning_rate.initial = 1e-3 # learning rates
+ config.algo.optim_params.actor.learning_rate.initial = 3e-4
+ config.algo.actor.target_entropy = "default" # use automatic entropy tuning to default target value
+ config.algo.critic.deterministic_backup = True # deterministic Q-backup
+ config.algo.critic.target_q_gap = 5.0 # use Lagrange, with threshold 5.0
+ config.algo.critic.min_q_weight = 1.0
+ config.algo.target_tau = 5e-3 # tau 5e-3
+ config.algo.discount = 0.99 # discount 0.99
+ config.algo.critic.layer_dims = (300, 400) # all MLP sizes at (300, 400)
+ config.algo.actor.layer_dims = (300, 400)
+
+ if hdf5_type not in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ # update policy LR to 1e-4 for image runs
+ config.algo.optim_params.actor.learning_rate.initial = 1e-4
+
+ return config
+
+
+def modify_hbc_config_for_dataset(config, task_name, dataset_type, hdf5_type):
+ """
+ Modifies a HBCConfig object for training on a particular kind of dataset. This function
+ just sets algorithm hyperparameters in the algo config depending on the kind of
+ dataset.
+
+ Args:
+ config (HBCConfig instance): config to modify
+
+ task_name (str): identify task that dataset was collected on. Only used to distinguish
+ between simulation and real-world, for an assert statement
+
+ dataset_type (str): dataset type for this dataset (e.g. ph, mh, mg, paired).
+
+ hdf5_type (str): hdf5 type for this dataset (e.g. raw, low_dim, image).
+ """
+ assert isinstance(config, HBCConfig), "must be HBCConfig"
+ assert config.algo_name == "hbc", "must be HBCConfig"
+ assert dataset_type in ["ph", "mh", "mg", "paired"], "invalid dataset type"
+ assert hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"], "HBC only runs on low-dim"
+ is_real_dataset = "real" in task_name
+ assert not is_real_dataset, "we only ran BC-RNN on real robot"
+
+ with config.algo.values_unlocked():
+ # base parameters that may get modified further
+ config.algo.actor.optim_params.policy.learning_rate.initial = 1e-3 # learning rates
+ config.algo.planner.optim_params.goal_network.learning_rate.initial = 1e-3
+
+ config.algo.planner.vae.enabled = True # goal VAE settings
+ config.algo.planner.vae.kl_weight = 5e-4 # beta 5e-4
+ config.algo.planner.vae.latent_dim = 16 # latent dim 16
+ config.algo.planner.vae.prior.learn = True # learn GMM prior with 10 modes
+ config.algo.planner.vae.prior.is_conditioned = True
+ config.algo.planner.vae.prior.use_gmm = True
+ config.algo.planner.vae.prior.gmm_learn_weights = True
+ config.algo.planner.vae.prior.gmm_num_modes = 10
+ config.algo.planner.vae.encoder_layer_dims = (1024, 1024) # VAE network sizes
+ config.algo.planner.vae.decoder_layer_dims = (1024, 1024)
+ config.algo.planner.vae.prior_layer_dims = (1024, 1024)
+
+ config.algo.actor.rnn.hidden_dim = 400 # actor RNN dim
+ config.algo.actor.actor_layer_dims = () # no MLP layers between rnn layer and output
+
+ if dataset_type == "mg":
+ # update hyperparams for machine-generated datasets
+ config.algo.actor.rnn.hidden_dim = 100
+ config.algo.actor.actor_layer_dims = (1024, 1024)
+
+ return config
+
+
+def modify_iris_config_for_dataset(config, task_name, dataset_type, hdf5_type):
+ """
+ Modifies a IRISConfig object for training on a particular kind of dataset. This function
+ just sets algorithm hyperparameters in the algo config depending on the kind of
+ dataset.
+
+ Args:
+ config (IRISConfig instance): config to modify
+
+ task_name (str): identify task that dataset was collected on. Only used to distinguish
+ between simulation and real-world, for an assert statement
+
+ dataset_type (str): dataset type for this dataset (e.g. ph, mh, mg, paired).
+
+ hdf5_type (str): hdf5 type for this dataset (e.g. raw, low_dim, image).
+ """
+ assert isinstance(config, IRISConfig), "must be IRISConfig"
+ assert config.algo_name == "iris", "must be IRISConfig"
+ assert dataset_type in ["ph", "mh", "mg", "paired"], "invalid dataset type"
+ assert hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"], "IRIS only runs on low-dim"
+ is_real_dataset = "real" in task_name
+ assert not is_real_dataset, "we only ran BC-RNN on real robot"
+
+ with config.algo.values_unlocked():
+ # base parameters that may get modified further
+ config.algo.actor.optim_params.policy.learning_rate.initial = 1e-3 # learning rates
+ config.algo.value_planner.planner.optim_params.goal_network.learning_rate.initial = 1e-3
+ config.algo.value_planner.value.optim_params.critic.learning_rate.initial = 1e-3
+ config.algo.value_planner.value.optim_params.action_sampler.learning_rate.initial = 1e-4
+
+ config.algo.value_planner.planner.vae.enabled = True # goal VAE settings
+ config.algo.value_planner.planner.vae.kl_weight = 5e-4 # beta 5e-4
+ config.algo.value_planner.planner.vae.latent_dim = 14 # latent dim 14
+ config.algo.value_planner.planner.vae.prior.learn = True # learn GMM prior with 10 modes
+ config.algo.value_planner.planner.vae.prior.is_conditioned = True
+ config.algo.value_planner.planner.vae.prior.use_gmm = True
+ config.algo.value_planner.planner.vae.prior.gmm_learn_weights = True
+ config.algo.value_planner.planner.vae.prior.gmm_num_modes = 10
+ config.algo.value_planner.planner.vae.encoder_layer_dims = (1024, 1024) # VAE network sizes
+ config.algo.value_planner.planner.vae.decoder_layer_dims = (1024, 1024)
+ config.algo.value_planner.planner.vae.prior_layer_dims = (1024, 1024)
+
+ config.algo.value_planner.value.target_tau = 5e-4 # Value tau
+ config.algo.value_planner.value.action_sampler.vae.kl_weight = 0.5 # Value KL
+ config.algo.value_planner.value.action_sampler.vae.latent_dim = 16
+ config.algo.value_planner.value.action_sampler.actor_layer_dims = (300, 400)
+
+ config.algo.actor.rnn.hidden_dim = 400 # actor RNN dim
+ config.algo.actor.actor_layer_dims = () # no MLP layers between rnn layer and output
+
+ if dataset_type in ["mh", "paired"]:
+ # value LR 1e-4, KL weight is 0.05 for multi-human datasets
+ config.algo.value_planner.value.optim_params.critic.learning_rate.initial = 1e-4
+ config.algo.value_planner.value.action_sampler.vae.kl_weight = 0.05
+
+ if dataset_type in ["mg"]:
+ # Enable value actor and set larger target tau
+ config.algo.value_planner.value.actor.enabled = True
+ config.algo.value_planner.value.optim_params.actor.learning_rate.initial = 1e-3
+ config.algo.value_planner.value.target_tau = 5e-3
+
+ return config
+
+
+def generate_experiment_config(
+ base_exp_name,
+ base_config_dir,
+ base_dataset_dir,
+ base_output_dir,
+ algo_name,
+ algo_config_modifier,
+ task_name,
+ dataset_type,
+ hdf5_type,
+ filter_key=None,
+ additional_name=None,
+ additional_config_modifier=None,
+):
+ """
+ Helper function to generate a config for a particular experiment.
+
+ Args:
+ base_exp_name (str): name that identifies this set of experiments
+
+ base_config_dir (str): base directory to place generated configs
+
+ base_dataset_dir (str): path to directory where datasets are on disk.
+ Directory structure is expected to be consistent with the output
+ of @make_dataset_dirs in the download_datasets.py script.
+
+ base_output_dir (str): directory to save training results to. If None, will use the directory
+ from the default algorithm configs.
+
+        algo_name (str): identifies the algorithm - one of ["bc", "bc_rnn", "bcq", "cql", "hbc", "iris"]
+
+ algo_config_modifier (function): function to modify config to add algo hyperparameter
+ settings, given the task, dataset, and hdf5 types.
+
+ task_name (str): identify task that dataset was collected on. Only used to distinguish
+ between simulation and real-world, for an assert statement
+
+ dataset_type (str): dataset type for this dataset (e.g. ph, mh, mg, paired).
+
+ hdf5_type (str): hdf5 type for this dataset (e.g. raw, low_dim, image).
+
+ filter_key (str): if not None, use the provided filter key to select a subset of the
+ provided dataset
+
+ additional_name (str): if provided, will add this name to the generated experiment name, and
+ the name of the generated config json
+
+ additional_config_modifier (function): if provided, run this last function on the config
+ to make final modifications before generating the json.
+ """
+ if "real" not in task_name:
+ assert hdf5_type != "raw", "cannot train on raw demonstrations"
+
+ # decide whether to use low-dim or image training defaults
+ modifier_for_obs = modify_config_for_default_image_exp
+ if hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ modifier_for_obs = modify_config_for_default_low_dim_exp
+
+ algo_config_name = "bc" if algo_name == "bc_rnn" else algo_name
+ config = config_factory(algo_name=algo_config_name)
+ # turn into default config for observation modalities (e.g.: low-dim or rgb)
+ config = modifier_for_obs(config)
+ # add in config based on the dataset
+ config = modify_config_for_dataset(
+ config=config,
+ task_name=task_name,
+ dataset_type=dataset_type,
+ hdf5_type=hdf5_type,
+ base_dataset_dir=base_dataset_dir,
+ filter_key=filter_key,
+ )
+ # add in algo hypers based on dataset
+ config = algo_config_modifier(
+ config=config,
+ task_name=task_name,
+ dataset_type=dataset_type,
+ hdf5_type=hdf5_type,
+ )
+ if additional_config_modifier is not None:
+ # use additional config modifier if provided
+ config = additional_config_modifier(config)
+
+ # account for filter key in experiment naming and directory naming
+ filter_key_str = "_{}".format(filter_key) if filter_key is not None else ""
+ dataset_type_dir = "{}/{}".format(dataset_type, filter_key) if filter_key is not None else dataset_type
+
+ # account for @additional_name
+ additional_name_str = "_{}".format(additional_name) if additional_name is not None else ""
+ json_name = "{}{}".format(algo_name, additional_name_str)
+
+ # set experiment name
+ with config.experiment.values_unlocked():
+ config.experiment.name = "{}_{}_{}_{}{}_{}{}".format(base_exp_name, algo_name, task_name, dataset_type, filter_key_str, hdf5_type, additional_name_str)
+ # set output folder
+ with config.train.values_unlocked():
+ if base_output_dir is None:
+ base_output_dir = config.train.output_dir
+ config.train.output_dir = os.path.join(base_output_dir, base_exp_name, algo_name, task_name, dataset_type_dir, hdf5_type, "trained_models")
+
+ # save config to json file
+ dir_to_save = os.path.join(base_config_dir, base_exp_name, task_name, dataset_type_dir, hdf5_type)
+ os.makedirs(dir_to_save, exist_ok=True)
+ json_path = os.path.join(dir_to_save, "{}.json".format(json_name))
+ config.dump(filename=json_path)
+
+ return config, json_path
+
+
+def generate_core_configs(
+ base_config_dir,
+ base_dataset_dir,
+ base_output_dir,
+ algo_to_config_modifier,
+):
+ """
+ Helper function to generate all configs for core set of experiments.
+
+ Args:
+ base_config_dir (str): base directory to place generated configs
+
+ base_dataset_dir (str): path to directory where datasets are on disk.
+ Directory structure is expected to be consistent with the output
+ of @make_dataset_dirs in the download_datasets.py script.
+
+ base_output_dir (str): directory to save training results to. If None, will use the directory
+ from the default algorithm configs.
+
+ algo_to_config_modifier (dict): dictionary that maps algo name to a function that modifies configs
+ to add algo hyperparameter settings, given the task, dataset, and hdf5 types.
+ """
+ core_json_paths = Config() # use for convenient nested dict
+ for task in DATASET_REGISTRY:
+ for dataset_type in DATASET_REGISTRY[task]:
+ for hdf5_type in DATASET_REGISTRY[task][dataset_type]:
+ # if not real robot dataset, skip raw hdf5
+ is_real_dataset = ("real" in task)
+ if not is_real_dataset and hdf5_type == "raw":
+ continue
+
+ # get list of algorithms to generate configs for, for this hdf5 dataset
+ algos_to_generate = ["bc", "bc_rnn", "bcq", "cql", "hbc", "iris"]
+ if hdf5_type not in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+ # no hbc or iris for image runs
+ algos_to_generate = algos_to_generate[:-2]
+ if is_real_dataset:
+ # we only ran BC-RNN on real robot
+ algos_to_generate = ["bc_rnn"]
+
+ for algo_name in algos_to_generate:
+
+ # generate config for this experiment
+ config, json_path = generate_experiment_config(
+ base_exp_name="core",
+ base_config_dir=base_config_dir,
+ base_dataset_dir=base_dataset_dir,
+ base_output_dir=base_output_dir,
+ algo_name=algo_name,
+ algo_config_modifier=algo_to_config_modifier[algo_name],
+ task_name=task,
+ dataset_type=dataset_type,
+ hdf5_type=hdf5_type,
+ )
+
+ # save json path into dict
+ core_json_paths[task][dataset_type][hdf5_type][algo_name] = json_path
+
+ return core_json_paths
+
+
+def generate_subopt_configs(
+ base_config_dir,
+ base_dataset_dir,
+ base_output_dir,
+ algo_to_config_modifier,
+):
+ """
+ Helper function to generate all configs for the suboptimal human subsets of the multi-human datasets.
+ Note that while the paper includes the results on the can-paired dataset along with results on these
+    datasets, the configs for runs on the can-paired dataset are in the "core" set of runs.
+
+ Args:
+ base_config_dir (str): base directory to place generated configs
+
+ base_dataset_dir (str): path to directory where datasets are on disk.
+ Directory structure is expected to be consistent with the output
+ of @make_dataset_dirs in the download_datasets.py script.
+
+ base_output_dir (str): directory to save training results to. If None, will use the directory
+ from the default algorithm configs.
+
+ algo_to_config_modifier (dict): dictionary that maps algo name to a function that modifies configs
+ to add algo hyperparameter settings, given the task, dataset, and hdf5 types.
+ """
+ subopt_json_paths = Config() # use for convenient nested dict
+ for task in ["lift", "can", "square", "transport"]:
+ # only generate configs for multi-human data subsets
+ for dataset_type in ["mh"]:
+ # only low-dim / image
+ for hdf5_type in ["low_dim", "image"]:
+
+ # get list of algorithms to generate configs for, for this hdf5 dataset
+ algos_to_generate = ["bc", "bc_rnn", "bcq", "cql", "hbc", "iris"]
+ if hdf5_type == "image":
+ # no hbc or iris for image runs
+ algos_to_generate = algos_to_generate[:-2]
+
+ for algo_name in algos_to_generate:
+
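+                    # each filter key selects a subset of the multi-human demos grouped by operator proficiency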
+ for fk in ["worse", "okay", "better", "worse_okay", "worse_better", "okay_better"]:
+
+ # generate config for this experiment
+ config, json_path = generate_experiment_config(
+ base_exp_name="subopt",
+ base_config_dir=base_config_dir,
+ base_dataset_dir=base_dataset_dir,
+ base_output_dir=base_output_dir,
+ algo_name=algo_name,
+ algo_config_modifier=algo_to_config_modifier[algo_name],
+ task_name=task,
+ dataset_type=dataset_type,
+ hdf5_type=hdf5_type,
+ filter_key=fk,
+ )
+
+ # save json path into dict
+ dataset_type_dir = "{}/{}".format(dataset_type, fk)
+ subopt_json_paths[task][dataset_type_dir][hdf5_type][algo_name] = json_path
+
+ return subopt_json_paths
+
+
+def generate_dataset_size_configs(
+ base_config_dir,
+ base_dataset_dir,
+ base_output_dir,
+ algo_to_config_modifier,
+):
+ """
+ Helper function to generate all configs for the dataset size ablation experiments, where BC-RNN models
+ were trained on 20% and 50% dataset sizes.
+
+ Args:
+ base_config_dir (str): base directory to place generated configs
+
+ base_dataset_dir (str): path to directory where datasets are on disk.
+ Directory structure is expected to be consistent with the output
+ of @make_dataset_dirs in the download_datasets.py script.
+
+ base_output_dir (str): directory to save training results to. If None, will use the directory
+ from the default algorithm configs.
+
+ algo_to_config_modifier (dict): dictionary that maps algo name to a function that modifies configs
+ to add algo hyperparameter settings, given the task, dataset, and hdf5 types.
+ """
+ size_ablation_json_paths = Config() # use for convenient nested dict
+ for task in ["lift", "can", "square", "transport"]:
+ for dataset_type in ["ph", "mh"]:
+ for hdf5_type in ["low_dim", "image"]:
+
+ # only bc-rnn
+ algo_name = "bc_rnn"
+ for fk in ["20_percent", "50_percent"]:
+
+ # generate config for this experiment
+ config, json_path = generate_experiment_config(
+ base_exp_name="dataset_size",
+ base_config_dir=base_config_dir,
+ base_dataset_dir=base_dataset_dir,
+ base_output_dir=base_output_dir,
+ algo_name=algo_name,
+ algo_config_modifier=algo_to_config_modifier[algo_name],
+ task_name=task,
+ dataset_type=dataset_type,
+ hdf5_type=hdf5_type,
+ filter_key=fk,
+ )
+
+ # save json path into dict
+ dataset_type_dir = "{}/{}".format(dataset_type, fk)
+ size_ablation_json_paths[task][dataset_type_dir][hdf5_type][algo_name] = json_path
+
+ return size_ablation_json_paths
+
+
+def generate_obs_ablation_configs(
+ base_config_dir,
+ base_dataset_dir,
+ base_output_dir,
+ algo_to_config_modifier,
+):
+ """
+ Helper function to generate all configs for the observation ablation experiments, where BC and BC-RNN models
+ were trained on different versions of low-dim and image observations.
+
+ Args:
+ base_config_dir (str): base directory to place generated configs
+
+ base_dataset_dir (str): path to directory where datasets are on disk.
+ Directory structure is expected to be consistent with the output
+ of @make_dataset_dirs in the download_datasets.py script.
+
+ base_output_dir (str): directory to save training results to. If None, will use the directory
+ from the default algorithm configs.
+
+ algo_to_config_modifier (dict): dictionary that maps algo name to a function that modifies configs
+ to add algo hyperparameter settings, given the task, dataset, and hdf5 types.
+ """
+
+ # observation config modifiers for these experiments
+ def add_eef_vel(config):
+ with config.observation.values_unlocked():
+ old_low_dim_mods = list(config.observation.modalities.obs.low_dim)
+ old_low_dim_mods.extend(["robot0_eef_vel_lin", "robot0_eef_vel_ang", "robot0_gripper_qvel"])
+ if "robot1_eef_pos" in old_low_dim_mods:
+ old_low_dim_mods.extend(["robot1_eef_vel_lin", "robot1_eef_vel_ang", "robot1_gripper_qvel"])
+ config.observation.modalities.obs.low_dim = old_low_dim_mods
+ return config
+
+ def add_proprio(config):
+ with config.observation.values_unlocked():
+ old_low_dim_mods = list(config.observation.modalities.obs.low_dim)
+ old_low_dim_mods.extend(["robot0_joint_pos_cos", "robot0_joint_pos_sin", "robot0_joint_vel"])
+ if "robot1_eef_pos" in old_low_dim_mods:
+ old_low_dim_mods.extend(["robot1_joint_pos_cos", "robot1_joint_pos_sin", "robot1_joint_vel"])
+ config.observation.modalities.obs.low_dim = old_low_dim_mods
+ return config
+
+ def remove_wrist(config):
+ with config.observation.values_unlocked():
+ old_image_mods = list(config.observation.modalities.obs.rgb)
+ config.observation.modalities.obs.rgb = [m for m in old_image_mods if "eye_in_hand" not in m]
+ return config
+
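+    # clearing the randomizer class disables image observation randomization (e.g. random crops) during training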
+ def remove_rand(config):
+ with config.observation.values_unlocked():
+ config.observation.encoder.rgb.obs_randomizer_class = None
+ return config
+
+ obs_ablation_json_paths = Config() # use for convenient nested dict
+ for task in ["square", "transport"]:
+ for dataset_type in ["ph", "mh"]:
+ for hdf5_type in ["low_dim", "image"]:
+
+ # observation modifiers to apply
+ if hdf5_type == "low_dim":
+ obs_modifiers = [add_eef_vel, add_proprio]
+ else:
+ obs_modifiers = [add_eef_vel, add_proprio, remove_wrist, remove_rand]
+
+ # only bc and bc-rnn
+ algos_to_generate = ["bc", "bc_rnn"]
+ for algo_name in algos_to_generate:
+ for obs_modifier in obs_modifiers:
+ # generate config for this experiment
+ config, json_path = generate_experiment_config(
+ base_exp_name="obs_ablation",
+ base_config_dir=base_config_dir,
+ base_dataset_dir=base_dataset_dir,
+ base_output_dir=base_output_dir,
+ algo_name=algo_name,
+ algo_config_modifier=algo_to_config_modifier[algo_name],
+ task_name=task,
+ dataset_type=dataset_type,
+ hdf5_type=hdf5_type,
+ additional_name=obs_modifier.__name__,
+ additional_config_modifier=obs_modifier,
+ )
+
+ # save json path into dict
+ algo_name_str = "{}_{}".format(algo_name, obs_modifier.__name__)
+ obs_ablation_json_paths[task][dataset_type][hdf5_type][algo_name_str] = json_path
+
+ return obs_ablation_json_paths
+
+
+def generate_hyper_ablation_configs(
+ base_config_dir,
+ base_dataset_dir,
+ base_output_dir,
+ algo_to_config_modifier,
+):
+ """
+ Helper function to generate all configs for the hyperparameter sensitivity experiments,
+ where BC-RNN models were trained on different ablations.
+
+ Args:
+ base_config_dir (str): base directory to place generated configs
+
+ base_dataset_dir (str): path to directory where datasets are on disk.
+ Directory structure is expected to be consistent with the output
+ of @make_dataset_dirs in the download_datasets.py script.
+
+ base_output_dir (str): directory to save training results to. If None, will use the directory
+ from the default algorithm configs.
+
+ algo_to_config_modifier (dict): dictionary that maps algo name to a function that modifies configs
+ to add algo hyperparameter settings, given the task, dataset, and hdf5 types.
+ """
+
+    # hyperparameter config modifiers for these experiments
+ def change_lr(config):
+ with config.algo.values_unlocked():
+ config.algo.optim_params.policy.learning_rate.initial = 1e-3
+ return config
+
+ def change_gmm(config):
+ with config.algo.values_unlocked():
+ config.algo.gmm.enabled = False
+ return config
+
+ def change_mlp(config):
+ with config.algo.values_unlocked():
+ config.algo.actor_layer_dims = (1024, 1024)
+ return config
+
+ def change_conv(config):
+ with config.observation.values_unlocked():
+ config.observation.encoder.rgb.core_class = 'ShallowConv'
+ config.observation.encoder.rgb.core_kwargs = Config()
+ return config
+
+ def change_rnnd_low_dim(config):
+ with config.algo.values_unlocked():
+ config.algo.rnn.hidden_dim = 100
+ return config
+
+ def change_rnnd_image(config):
+ with config.algo.values_unlocked():
+ config.algo.rnn.hidden_dim = 400
+ return config
+
+ hyper_ablation_json_paths = Config() # use for convenient nested dict
+ for task in ["square", "transport"]:
+ for dataset_type in ["ph", "mh"]:
+ for hdf5_type in ["low_dim", "image"]:
+
+                # hyperparameter modifiers to apply
+ if hdf5_type == "low_dim":
+ hyper_modifiers = [change_lr, change_gmm, change_mlp, change_rnnd_low_dim]
+ else:
+ hyper_modifiers = [change_lr, change_gmm, change_conv, change_rnnd_image]
+
+                # only bc-rnn
+ algo_name = "bc_rnn"
+ for hyper_modifier in hyper_modifiers:
+ # generate config for this experiment
+ config, json_path = generate_experiment_config(
+ base_exp_name="hyper_ablation",
+ base_config_dir=base_config_dir,
+ base_dataset_dir=base_dataset_dir,
+ base_output_dir=base_output_dir,
+ algo_name=algo_name,
+ algo_config_modifier=algo_to_config_modifier[algo_name],
+ task_name=task,
+ dataset_type=dataset_type,
+ hdf5_type=hdf5_type,
+ additional_name=hyper_modifier.__name__,
+ additional_config_modifier=hyper_modifier,
+ )
+
+ # save json path into dict
+ algo_name_str = "{}_{}".format(algo_name, hyper_modifier.__name__)
+ hyper_ablation_json_paths[task][dataset_type][hdf5_type][algo_name_str] = json_path
+
+ return hyper_ablation_json_paths
+
+
+def generate_d4rl_configs(
+ base_config_dir,
+ base_dataset_dir,
+ base_output_dir,
+ algo_to_config_modifier,
+):
+ """
+    Helper function to generate all configs for reproducing BCQ, CQL, TD3-BC, and IQL runs on some D4RL
+ environments.
+
+ Args:
+ base_config_dir (str): base directory to place generated configs
+
+ base_dataset_dir (str): path to directory where datasets are on disk.
+ Directory structure is expected to be consistent with the output
+ of @make_dataset_dirs in the download_datasets.py script.
+
+ base_output_dir (str): directory to save training results to. If None, will use the directory
+ from the default algorithm configs.
+
+ algo_to_config_modifier (dict): dictionary that maps algo name to a function that modifies configs
+ to add algo hyperparameter settings, given the task, dataset, and hdf5 types.
+ """
+
+ def bcq_algo_config_modifier(config):
+ with config.algo.values_unlocked():
+ # all LRs 1e-3, enable actor
+ config.algo.optim_params.critic.learning_rate.initial = 1e-3
+ config.algo.optim_params.action_sampler.learning_rate.initial = 1e-3
+ config.algo.optim_params.actor.learning_rate.initial = 1e-3
+ config.algo.actor.enabled = True
+ config.algo.action_sampler.vae.kl_weight = 0.5
+ return config
+
+ def cql_algo_config_modifier(config):
+ with config.algo.values_unlocked():
+            # taken from the CQL settings described in the TD3-BC paper
+ config.algo.optim_params.critic.learning_rate.initial = 3e-4
+ config.algo.optim_params.actor.learning_rate.initial = 3e-5
+ config.algo.actor.bc_start_steps = 40000 # pre-training steps for actor
+ config.algo.critic.target_q_gap = None # no Lagrange, and fixed weight of 10.0
+ config.algo.critic.cql_weight = 10.0
+ config.algo.critic.min_q_weight = 1.0
+ config.algo.critic.deterministic_backup = True # deterministic backup (no entropy in Q-target)
+ config.algo.actor.layer_dims = (256, 256, 256) # MLP sizes
+ config.algo.critic.layer_dims = (256, 256, 256)
+ return config
+
+ def iql_algo_config_modifier(config):
+ with config.algo.values_unlocked():
+ # taken from IQL settings described in their paper
+ config.algo.target_tau = 0.005
+ config.algo.vf_quantile = 0.7
+ config.algo.adv.beta = 3.0
+ config.algo.optim_params.critic.learning_rate.initial = 3e-4
+ config.algo.optim_params.vf.learning_rate.initial = 3e-4
+ config.algo.optim_params.actor.learning_rate.initial = 3e-4
+ config.algo.actor.layer_dims = (256, 256, 256) # MLP sizes
+ config.algo.critic.layer_dims = (256, 256, 256)
+ return config
+
+ d4rl_tasks = [
+ # "halfcheetah-random-v2",
+ # "hopper-random-v2",
+ # "walker2d-random-v2",
+ "halfcheetah-medium-v2",
+ "hopper-medium-v2",
+ "walker2d-medium-v2",
+ "halfcheetah-expert-v2",
+ "hopper-expert-v2",
+ "walker2d-expert-v2",
+ "halfcheetah-medium-expert-v2",
+ "hopper-medium-expert-v2",
+ "walker2d-medium-expert-v2",
+ # "halfcheetah-medium-replay-v2",
+ # "hopper-medium-replay-v2",
+ # "walker2d-medium-replay-v2",
+ ]
+ d4rl_json_paths = Config() # use for convenient nested dict
+ for task_name in d4rl_tasks:
+ for algo_name in ["bcq", "cql", "td3_bc", "iql"]:
+ config = config_factory(algo_name=algo_name)
+
+            # hack: copy experiment, train, and observation sections from td3-bc, since that config has defaults for training with D4RL
+ if algo_name != "td3_bc":
+ ref_config = config_factory(algo_name="td3_bc")
+ with config.values_unlocked():
+ config.experiment = ref_config.experiment
+ config.train = ref_config.train
+ config.observation = ref_config.observation
+ config.train.hdf5_normalize_obs = False # only TD3-BC uses observation normalization
+
+ # modify algo section for d4rl defaults
+ if algo_name == "bcq":
+ config = bcq_algo_config_modifier(config)
+ elif algo_name == "cql":
+ config = cql_algo_config_modifier(config)
+ elif algo_name == "iql":
+ config = iql_algo_config_modifier(config)
+
+ # set experiment name
+ with config.experiment.values_unlocked():
+ config.experiment.name = "{}_{}_{}".format("d4rl", algo_name, task_name)
+ # set output folder and dataset
+ with config.train.values_unlocked():
+ if base_output_dir is None:
+ base_output_dir_for_algo = "../{}_trained_models".format(algo_name)
+ else:
+ base_output_dir_for_algo = base_output_dir
+ config.train.output_dir = os.path.join(base_output_dir_for_algo, "d4rl", algo_name, task_name, "trained_models")
+ config.train.data = os.path.join(base_dataset_dir, "d4rl", "converted",
+ "{}.hdf5".format(task_name.replace("-", "_")))
+
+ # save config to json file
+ dir_to_save = os.path.join(base_config_dir, "d4rl", task_name)
+ os.makedirs(dir_to_save, exist_ok=True)
+ json_path = os.path.join(dir_to_save, "{}.json".format(algo_name))
+ config.dump(filename=json_path)
+
+ # save json path into dict
+ d4rl_json_paths[task_name][""][""][algo_name] = json_path
+
+ return d4rl_json_paths
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # Directory where generated configs will be placed
+ parser.add_argument(
+ "--config_dir",
+ type=str,
+ default=None,
+ help="Directory where generated configs will be placed. Defaults to 'paper' subfolder in exps folder of repository",
+ )
+
+ # directory where released datasets are located
+ parser.add_argument(
+ "--dataset_dir",
+ type=str,
+ default=None,
+ help="Base dataset directory for released datasets. Defaults to datasets folder in repository.",
+ )
+
+ # output directory for training runs (will be written to configs)
+ parser.add_argument(
+ "--output_dir",
+ type=str,
+ default=None,
+ help="Base output directory for all training runs that will be written to generated configs.",
+ )
+
+ args = parser.parse_args()
+
+ # read args
+ generated_configs_base_dir = args.config_dir
+ if generated_configs_base_dir is None:
+ generated_configs_base_dir = os.path.join(robomimic.__path__[0], "exps/paper")
+
+ datasets_base_dir = args.dataset_dir
+ if datasets_base_dir is None:
+ datasets_base_dir = os.path.join(robomimic.__path__[0], "../datasets")
+
+ output_base_dir = args.output_dir
+
+ # algo to modifier
+ algo_to_modifier = dict(
+ bc=modify_bc_config_for_dataset,
+ bc_rnn=modify_bc_rnn_config_for_dataset,
+ bcq=modify_bcq_config_for_dataset,
+ cql=modify_cql_config_for_dataset,
+ hbc=modify_hbc_config_for_dataset,
+ iris=modify_iris_config_for_dataset,
+ )
+
+ # exp name to config generator
+ exp_name_to_generator = dict(
+ core=generate_core_configs,
+ subopt=generate_subopt_configs,
+ dataset_size=generate_dataset_size_configs,
+ obs_ablation=generate_obs_ablation_configs,
+ hyper_ablation=generate_hyper_ablation_configs,
+ d4rl=generate_d4rl_configs,
+ )
+
+ # generate configs for each experiment name
+ config_json_paths = Config() # use for convenient nested dict
+ for exp_name in exp_name_to_generator:
+ config_json_paths[exp_name] = exp_name_to_generator[exp_name](
+ base_config_dir=generated_configs_base_dir,
+ base_dataset_dir=datasets_base_dir,
+ base_output_dir=output_base_dir,
+ algo_to_config_modifier=algo_to_modifier,
+ )
+
+ # write output shell scripts
+ for exp_name in config_json_paths:
+ shell_path = os.path.join(generated_configs_base_dir, "{}.sh".format(exp_name))
+ with open(shell_path, "w") as f:
+ f.write("#!/bin/bash\n\n")
+ f.write("# " + "=" * 10 + exp_name + "=" * 10 + "\n")
+ train_script_loc = os.path.join(robomimic.__path__[0], "scripts/train.py")
+
+ for task in config_json_paths[exp_name]:
+ for dataset_type in config_json_paths[exp_name][task]:
+ for hdf5_type in config_json_paths[exp_name][task][dataset_type]:
+ f.write("\n")
+ f.write("# task: {}\n".format(task))
+ if len(dataset_type) > 0:
+ f.write("# dataset type: {}\n".format(dataset_type))
+ if len(hdf5_type) > 0:
+ f.write("# hdf5 type: {}\n".format(hdf5_type))
+ for algo_name in config_json_paths[exp_name][task][dataset_type][hdf5_type]:
+ # f.write("# {}\n".format(algo_name))
+ exp_json_path = config_json_paths[exp_name][task][dataset_type][hdf5_type][algo_name]
+ cmd = "python {} --config {}\n".format(train_script_loc, exp_json_path)
+ f.write(cmd)
+ f.write("\n")
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/get_dataset_info.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/get_dataset_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..8971bcdb725adbe9a0df0e5e0b77f88b8684d153
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/get_dataset_info.py
@@ -0,0 +1,135 @@
+"""
+Helper script to report dataset information. By default, will print trajectory length statistics,
+the maximum and minimum action element in the dataset, filter keys present, environment
+metadata, and the structure of the first demonstration. If --verbose is passed, it will
+report the exact demo keys under each filter key, and the structure of all demonstrations
+(not just the first one).
+
+Args:
+ dataset (str): path to hdf5 dataset
+
+ filter_key (str): if provided, report statistics on the subset of trajectories
+ in the file that correspond to this filter key
+
+ verbose (bool): if flag is provided, print more details, like the structure of all
+ demonstrations (not just the first one)
+
+Example usage:
+
+ # run script on example hdf5 packaged with repository
+ python get_dataset_info.py --dataset ../../tests/assets/test.hdf5
+
+ # run script only on validation data
+ python get_dataset_info.py --dataset ../../tests/assets/test.hdf5 --filter_key valid
+"""
+import h5py
+import json
+import argparse
+import numpy as np
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ help="path to hdf5 dataset",
+ )
+ parser.add_argument(
+ "--filter_key",
+ type=str,
+ default=None,
+ help="(optional) if provided, report statistics on the subset of trajectories \
+ in the file that correspond to this filter key",
+ )
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="verbose output",
+ )
+ args = parser.parse_args()
+
+ # extract demonstration list from file
+ filter_key = args.filter_key
+ all_filter_keys = None
+ f = h5py.File(args.dataset, "r")
+ if filter_key is not None:
+ # use the demonstrations from the filter key instead
+ print("NOTE: using filter key {}".format(filter_key))
+ demos = sorted([elem.decode("utf-8") for elem in np.array(f["mask/{}".format(filter_key)])])
+ else:
+ # use all demonstrations
+ demos = sorted(list(f["data"].keys()))
+
+ # extract filter key information
+ if "mask" in f:
+ all_filter_keys = {}
+ for fk in f["mask"]:
+ fk_demos = sorted([elem.decode("utf-8") for elem in np.array(f["mask/{}".format(fk)])])
+ all_filter_keys[fk] = fk_demos
+
+ # put demonstration list in increasing episode order
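+    # demo names are of the form "demo_<n>", so sort by the integer suffix after the "demo_" prefix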
+ inds = np.argsort([int(elem[5:]) for elem in demos])
+ demos = [demos[i] for i in inds]
+
+ # extract length of each trajectory in the file
+ traj_lengths = []
+ action_min = np.inf
+ action_max = -np.inf
+ for ep in demos:
+ traj_lengths.append(f["data/{}/actions".format(ep)].shape[0])
+ action_min = min(action_min, np.min(f["data/{}/actions".format(ep)][()]))
+ action_max = max(action_max, np.max(f["data/{}/actions".format(ep)][()]))
+ traj_lengths = np.array(traj_lengths)
+
+ # report statistics on the data
+ print("")
+ print("total transitions: {}".format(np.sum(traj_lengths)))
+ print("total trajectories: {}".format(traj_lengths.shape[0]))
+ print("traj length mean: {}".format(np.mean(traj_lengths)))
+ print("traj length std: {}".format(np.std(traj_lengths)))
+ print("traj length min: {}".format(np.min(traj_lengths)))
+ print("traj length max: {}".format(np.max(traj_lengths)))
+ print("action min: {}".format(action_min))
+ print("action max: {}".format(action_max))
+ print("")
+ print("==== Filter Keys ====")
+ if all_filter_keys is not None:
+ for fk in all_filter_keys:
+ print("filter key {} with {} demos".format(fk, len(all_filter_keys[fk])))
+ else:
+ print("no filter keys")
+ print("")
+ if args.verbose:
+ if all_filter_keys is not None:
+ print("==== Filter Key Contents ====")
+ for fk in all_filter_keys:
+ print("filter_key {} with {} demos: {}".format(fk, len(all_filter_keys[fk]), all_filter_keys[fk]))
+ print("")
+ env_meta = json.loads(f["data"].attrs["env_args"])
+ print("==== Env Meta ====")
+ print(json.dumps(env_meta, indent=4))
+ print("")
+
+ print("==== Dataset Structure ====")
+ for ep in demos:
+ print("episode {} with {} transitions".format(ep, f["data/{}".format(ep)].attrs["num_samples"]))
+ for k in f["data/{}".format(ep)]:
+ if k in ["obs", "next_obs"]:
+ print(" key: {}".format(k))
+ for obs_k in f["data/{}/{}".format(ep, k)]:
+ shape = f["data/{}/{}/{}".format(ep, k, obs_k)].shape
+ dtype = f["data/{}/{}/{}".format(ep, k, obs_k)].dtype
+ print(" observation key {} with shape {} and dtype {}".format(obs_k, shape, dtype))
+ elif isinstance(f["data/{}/{}".format(ep, k)], h5py.Dataset):
+ key_shape = f["data/{}/{}".format(ep, k)].shape
+ print(" key: {} with shape {}".format(k, key_shape))
+
+ if not args.verbose:
+ break
+
+ f.close()
+
+ # maybe display error message
+ print("")
+ if (action_min < -1.) or (action_max > 1.):
+ raise Exception("Dataset should have actions in [-1., 1.] but got bounds [{}, {}]".format(action_min, action_max))
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/give_slack_notification.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/give_slack_notification.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b23d9a4945292eba0018b16063a3a2ae7f3123
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/give_slack_notification.py
@@ -0,0 +1,53 @@
+"""
+Script to send a slack message for notifications on completed training runs.
+Super extra, but gotta love it.
+"""
+
+import os
+import argparse
+import socket
+import ssl as ssl_lib
+import certifi
+import time
+import datetime
+
+import slack_sdk
+from slack_sdk import WebClient
+from slack_sdk.errors import SlackApiError
+
+import robomimic
+import robomimic.macros as Macros
+
+
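+# NOTE: assumes Macros.SLACK_TOKEN and Macros.SLACK_USER_ID have been set (e.g. in a private macros file)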
+def give_slack_notif(msg):
+ # for some reason, we need to explicitly create an SSL context
+ ssl_context = ssl_lib.create_default_context(cafile=certifi.where())
+ client = WebClient(Macros.SLACK_TOKEN, ssl=ssl_context)
+
+ try:
+ response = client.chat_postMessage(
+ channel=Macros.SLACK_USER_ID,
+ text=msg,
+ )
+ except SlackApiError as e:
+ # You will get a SlackApiError if "ok" is False
+ assert e.response["ok"] is False
+ assert e.response["error"] # str like 'invalid_auth', 'channel_not_found'
+ print(f"Got a slack error: {e.response['error']}")
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--message",
+ type=str,
+ )
+ args = parser.parse_args()
+
+ # make sure to parse \n from command line
+ message = args.message.replace("\\n", "\n")
+
+ # add some metadata and send message
+ t_now = time.time()
+ time_str = datetime.datetime.fromtimestamp(t_now).strftime('%m/%d/%Y %H:%M:%S')
+ message = "Hostname: `{}`\nProcess ID: `{}`\nTimestamp: `{}`\n```{}```".format(socket.gethostname(), os.getpid(), time_str, message)
+ give_slack_notif(message)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/hyperparam_helper.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/hyperparam_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..870c739ecbf4a751b6e62c363555d451cfa68ae2
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/hyperparam_helper.py
@@ -0,0 +1,141 @@
+"""
+A useful script for generating json files and shell scripts for conducting parameter scans.
+The script takes a path to a base json file as an argument and a shell file name.
+It generates a set of new json files in the same folder as the base json file, and
+a shell file script that contains commands to run for each experiment.
+
+Instructions:
+
+(1) Start with a base json that specifies a complete set of parameters for a single
+ run. This only needs to include parameters you want to sweep over, and parameters
+ that are different from the defaults. You can set this file path by either
+ passing it as an argument (e.g. --config /path/to/base.json) or by directly
+ setting the config file in @make_generator. The new experiment jsons will be put
+ into the same directory as the base json.
+
+(2) Decide on what json parameters you would like to sweep over, and fill those in as
+ keys in @make_generator below, taking note of the hierarchical key
+ formatting using "/" or ".". Fill in corresponding values for each - these will
+ be used in creating the experiment names, and for determining the range
+    of values to sweep. Parameters that should be swept together should
+ be assigned the same group number.
+
+(3) Set the output script name by either passing it as an argument (e.g. --script /path/to/script.sh)
+ or by directly setting the script file in @make_generator. The script to run all experiments
+ will be created at the specified path.
+
+Args:
+ config (str): path to a base config json file that will be modified to generate config jsons.
+ The jsons will be generated in the same folder as this file.
+
+ script (str): path to output script that contains commands to run the generated training runs
+
+Example usage:
+
+ # assumes that /tmp/gen_configs/base.json has already been created (see quickstart section of docs for an example)
+ python hyperparam_helper.py --config /tmp/gen_configs/base.json --script /tmp/gen_configs/out.sh
+"""
+import argparse
+
+import robomimic
+import robomimic.utils.hyperparam_utils as HyperparamUtils
+
+
+def make_generator(config_file, script_file):
+ """
+ Implement this function to setup your own hyperparameter scan!
+ """
+ generator = HyperparamUtils.ConfigGenerator(
+ base_config_file=config_file, script_file=script_file
+ )
+
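+    # params that share a group number are swept together; the generator takes the cross
+    # product across groups, so groups 1-3 below produce 2 x 2 x 2 = 8 configs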
+ # use RNN with horizon 10
+ generator.add_param(
+ key="algo.rnn.enabled",
+ name="",
+ group=0,
+ values=[True],
+ )
+ generator.add_param(
+ key="train.seq_length",
+ name="",
+ group=0,
+ values=[10],
+ )
+ generator.add_param(
+ key="algo.rnn.horizon",
+ name="",
+ group=0,
+ values=[10],
+ )
+
+ # LR - 1e-3, 1e-4
+ generator.add_param(
+ key="algo.optim_params.policy.learning_rate.initial",
+ name="plr",
+ group=1,
+ values=[1e-3, 1e-4],
+ )
+
+ # GMM y / n
+ generator.add_param(
+ key="algo.gmm.enabled",
+ name="gmm",
+ group=2,
+ values=[True, False],
+ value_names=["t", "f"],
+ )
+
+ # RNN dim 400 + MLP dims (1024, 1024) vs. RNN dim 1000 + empty MLP dims ()
+ generator.add_param(
+ key="algo.rnn.hidden_dim",
+ name="rnnd",
+ group=3,
+ values=[
+ 400,
+ 1000,
+ ],
+ )
+ generator.add_param(
+ key="algo.actor_layer_dims",
+ name="mlp",
+ group=3,
+ values=[
+ [1024, 1024],
+ [],
+ ],
+ value_names=["1024", "0"],
+ )
+
+ return generator
+
+
+def main(args):
+
+ # make config generator
+ generator = make_generator(config_file=args.config, script_file=args.script)
+
+ # generate jsons and script
+ generator.generate()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # Path to base json config - will override any defaults.
+ parser.add_argument(
+ "--config",
+ type=str,
+ help="path to base config json that will be modified to generate jsons. The jsons will\
+ be generated in the same folder as this file.",
+ )
+
+ # Script name to generate - will override any defaults
+ parser.add_argument(
+ "--script",
+ type=str,
+ help="path to output script that contains commands to run the generated training runs",
+ )
+
+ args = parser.parse_args()
+ main(args)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/hyperparam_helper_diffusion.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/hyperparam_helper_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..085d084be7a78e73ec149474a9ded44722880883
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/hyperparam_helper_diffusion.py
@@ -0,0 +1,297 @@
+"""
+Version of hyperparam helper to easily spin up runs with different base configs and diffusion policy.
+"""
+import os
+import shutil
+import json
+import argparse
+
+import robomimic
+import robomimic.utils.hyperparam_utils as HyperparamUtils
+
+import maglev_utils
+from maglev_utils.utils.file_utils import config_generator_to_script_lines
+
+
+# set base folder for where to copy each base config and generate new configs
+CONFIG_DIR = "/tmp/diffusion_configs"
+
+# path to base robomimic training config(s)
+BASE_CONFIGS = [
+ # "~/Desktop/mimicgen_env_data/base_train_diffusion.json",
+ # "~/Desktop/mimicgen_env_data/base_train_diffusion_image.json",
+ "~/Desktop/mimicgen_env_data/base_train_diffusion.json",
+]
+
+# output directory for this set of runs
+OUTPUT_DIR = "/tmp/diffusion_runs"
+
+
+def make_generators(base_configs):
+ """Helper function to make all generators."""
+ all_settings = [
+ # # low-dim
+ # dict(
+ # dataset_paths=[
+ # "/tmp/low_dim.hdf5",
+ # ],
+ # dataset_names=[
+ # "low_dim",
+ # ],
+ # horizon=400,
+ # ),
+ # # image
+ # dict(
+ # dataset_paths=[
+ # "/tmp/image.hdf5",
+ # ],
+ # dataset_names=[
+ # "image",
+ # ],
+ # horizon=400,
+ # ),
+ dict(
+ dataset_paths=[
+ "/ext2/rebuttal/diffusion/square_ph_abs_im.hdf5",
+ ],
+ dataset_names=[
+ "square_ph_ld",
+ ],
+ horizon=400,
+ ),
+ ]
+
+ assert len(base_configs) == len(all_settings)
+ ret = []
+ for conf, setting in zip(base_configs, all_settings):
+ ret.append(make_gen(os.path.expanduser(conf), setting))
+ return ret
+
+
+def make_gen(base_config, settings):
+ """
+ Specify training configs to generate here.
+ """
+ generator = HyperparamUtils.ConfigGenerator(
+ base_config_file=base_config,
+ script_file="", # will be overriden in next step
+ )
+
+ # add some params to sweep
+ dataset_values = [[dict(path=x)] for x in settings["dataset_paths"]]
+ generator.add_param(
+ key="train.data",
+ name="ds",
+ group=0,
+ values=dataset_values,
+ value_names=settings["dataset_names"],
+ )
+
+ # rollout settings
+ generator.add_param(
+ key="experiment.rollout.horizon",
+ name="",
+ group=1,
+ values=[settings["horizon"]],
+ )
+
+ # output path
+ generator.add_param(
+ key="train.output_dir",
+ name="",
+ group=2,
+ values=[
+ OUTPUT_DIR,
+ ],
+ )
+
+ # ensure robosuite env uses absolute pose actions
+ generator.add_param(
+ key="experiment.env_meta_update_dict",
+ name="",
+ group=-1,
+ values=[
+ {"env_kwargs": {"controller_configs": {"control_delta": False}}}
+ ],
+ )
+
+ # default action spec for diffusion policy
+ generator.add_param(
+ key="train.action_keys",
+ name="",
+ group=-1,
+ values=[
+ [
+ "action_dict/abs_pos",
+ "action_dict/abs_rot_6d",
+ "action_dict/gripper",
+ # "actions",
+ ],
+ ],
+ )
+ generator.add_param(
+ key="train.action_config",
+ name="",
+ group=-1,
+ values=[
+ {
+ "actions":{
+ "normalization": None,
+ },
+ "action_dict/abs_pos": {
+ "normalization": "min_max"
+ },
+ "action_dict/abs_rot_axis_angle": {
+ "normalization": "min_max",
+ "format": "rot_axis_angle"
+ },
+ "action_dict/abs_rot_6d": {
+ "normalization": None,
+ "format": "rot_6d"
+ },
+ "action_dict/rel_pos": {
+ "normalization": None,
+ },
+ "action_dict/rel_rot_axis_angle": {
+ "normalization": None,
+ "format": "rot_axis_angle"
+ },
+ "action_dict/rel_rot_6d": {
+ "normalization": None,
+ "format": "rot_6d"
+ },
+ "action_dict/gripper": {
+ "normalization": None,
+ }
+ }
+ ],
+ )
+
+ # num data workers 4 by default (for both low-dim and image) and cache mode "low_dim"
+ generator.add_param(
+ key="train.num_data_workers",
+ name="",
+ group=-1,
+ values=[4],
+ )
+ generator.add_param(
+ key="train.hdf5_cache_mode",
+ name="",
+ group=-1,
+ values=["low_dim"],
+ )
+
+ # num epochs 1000 for both low-dim and image
+ generator.add_param(
+ key="train.num_epochs",
+ name="",
+ group=-1,
+ values=[1000],
+ )
+
+ # set low-rate of eval - every 100 epochs
+ generator.add_param(
+ key="experiment.save.every_n_epochs",
+ name="",
+ group=-1,
+ values=[100],
+ )
+ generator.add_param(
+ key="experiment.rollout.rate",
+ name="",
+ group=-1,
+ values=[100],
+ )
+
+ # set noise scheduler
+ use_ddim = True
+ inf_steps = [(100, 10), (50, 5)]
+ # use_ddim = False
+ # inf_steps = []
+
+ generator.add_param(
+ key="algo.ddim.enabled",
+ name="ddim" if use_ddim else "",
+ group=1001,
+ values=[
+ use_ddim,
+ ],
+ value_names=[
+ "t" if use_ddim else "f",
+ ],
+ )
+ generator.add_param(
+ key="algo.ddpm.enabled",
+ name="ddpm" if not use_ddim else "",
+ group=1001,
+ values=[
+ (not use_ddim),
+ ],
+ value_names=[
+ "f" if not use_ddim else "t",
+ ],
+ )
+
+ if len(inf_steps) > 0:
+ train_inf_steps = [x[0] for x in inf_steps]
+ eval_inf_steps = [x[1] for x in inf_steps]
+ # set inf steps
+ generator.add_param(
+ key="algo.ddim.num_train_timesteps" if use_ddim else "algo.ddpm.num_train_timesteps",
+ name="train",
+ group=1002,
+ values=train_inf_steps,
+ )
+ generator.add_param(
+ key="algo.ddim.num_inference_timesteps" if use_ddim else "algo.ddpm.num_inference_timesteps",
+ name="eval",
+ group=1002,
+ values=eval_inf_steps,
+ )
+
+ # # seed
+ # generator.add_param(
+ # key="train.seed",
+ # name="seed",
+ # group=100000,
+ # values=[101, 102, 103],
+ # )
+
+ return generator
+
+
+def main(args):
+
+ # make config generators
+ generators = make_generators(base_configs=BASE_CONFIGS)
+
+ if args.config_dir is None:
+ args.config_dir = CONFIG_DIR
+
+ if os.path.exists(args.config_dir):
+ ans = input("Non-empty dir at {} will be removed.\nContinue (y / n)? \n".format(args.config_dir))
+ if ans != "y":
+ exit()
+ shutil.rmtree(args.config_dir)
+
+ all_json_files, run_lines = config_generator_to_script_lines(generators, config_dir=args.config_dir)
+
+ print("configs")
+ print(json.dumps(all_json_files, indent=4))
+ print("runs")
+ print(json.dumps(run_lines, indent=4))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+    # Directory where generated configs will be placed - overrides the CONFIG_DIR default.
+    parser.add_argument(
+        "--config_dir",
+        type=str,
+        help="directory where generated configs will be placed. Defaults to CONFIG_DIR if not provided.",
+ )
+
+ args = parser.parse_args()
+ main(args)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/playback_dataset.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/playback_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..3251ba6571e2c5df2a476655ea2c84fd57c7fe5e
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/playback_dataset.py
@@ -0,0 +1,514 @@
+"""
+A script to visualize dataset trajectories by loading the simulation states
+one by one or loading the first state and playing actions back open-loop.
+The script can generate videos as well, by rendering simulation frames
+during playback. The videos can also be generated using the image observations
+in the dataset (this is useful for real-robot datasets) by using the
+--use-obs argument.
+
+Args:
+ dataset (str): path to hdf5 dataset
+
+ filter_key (str): if provided, use the subset of trajectories
+ in the file that correspond to this filter key
+
+ n (int): if provided, stop after n trajectories are processed
+
+ use-obs (bool): if flag is provided, visualize trajectories with dataset
+ image observations instead of simulator
+
+ use-actions (bool): if flag is provided, use open-loop action playback
+ instead of loading sim states
+
+ render (bool): if flag is provided, use on-screen rendering during playback
+
+ video_path (str): if provided, render trajectories to this video file path
+
+ video_skip (int): render frames to a video every @video_skip steps
+
+ render_image_names (str or [str]): camera name(s) / image observation(s) to
+ use for rendering on-screen or to video
+
+ first (bool): if flag is provided, use first frame of each episode for playback
+ instead of the entire episode. Useful for visualizing task initializations.
+
+Example usage below:
+
+ # force simulation states one by one, and render agentview and wrist view cameras to video
+ python playback_dataset.py --dataset /path/to/dataset.hdf5 \
+ --render_image_names agentview robot0_eye_in_hand \
+ --video_path /tmp/playback_dataset.mp4
+
+ # playback the actions in the dataset, and render agentview camera during playback to video
+ python playback_dataset.py --dataset /path/to/dataset.hdf5 \
+ --use-actions --render_image_names agentview \
+ --video_path /tmp/playback_dataset_with_actions.mp4
+
+ # use the observations stored in the dataset to render videos of the dataset trajectories
+ python playback_dataset.py --dataset /path/to/dataset.hdf5 \
+ --use-obs --render_image_names agentview_image \
+ --video_path /tmp/obs_trajectory.mp4
+
+ # visualize depth observations along with image observations
+ python playback_dataset.py --dataset /path/to/dataset.hdf5 \
+ --use-obs --render_image_names agentview_image \
+ --render_depth_names agentview_depth \
+ --video_path /tmp/obs_trajectory.mp4
+
+ # visualize initial states in the demonstration data
+ python playback_dataset.py --dataset /path/to/dataset.hdf5 \
+ --first --render_image_names agentview \
+ --video_path /tmp/dataset_task_inits.mp4
+"""
+
+import os
+import json
+import h5py
+import argparse
+import imageio
+import matplotlib.pyplot as plt
+import matplotlib.cm as cm
+import numpy as np
+
+import robomimic
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.env_utils as EnvUtils
+import robomimic.utils.file_utils as FileUtils
+from robomimic.utils.vis_utils import depth_to_rgb
+from robomimic.envs.env_base import EnvBase, EnvType
+
+try:
+ import mimicgen
+except ImportError:
+ print("WARNING: could not import mimicgen envs")
+
+
+# Define default cameras to use for each env type
+DEFAULT_CAMERAS = {
+ EnvType.ROBOSUITE_TYPE: ["agentview"],
+ EnvType.IG_MOMART_TYPE: ["rgb"],
+ EnvType.GYM_TYPE: ValueError("No camera names supported for gym type env!"),
+ EnvType.REAL_TYPE: ["front_image"],
+ EnvType.GPRS_REAL_TYPE: ["front_image"],
+}
+
+
+def add_red_border(frame):
+ """Add a red border to image frame."""
+ border_size = int(0.05 * min(frame.shape[0], frame.shape[1])) # 5% of image
+ frame[:border_size, :, :] = [255., 0., 0.]
+ frame[-border_size:, :, :] = [255., 0., 0.]
+ frame[:, :border_size, :] = [255., 0., 0.]
+ frame[:, -border_size:, :] = [255., 0., 0.]
+ return frame
+
+
+def depth_to_rgb(depth_map, depth_min=None, depth_max=None):
+ """
+ Convert depth map to rgb array by computing normalized depth values in [0, 1].
+ """
+ # normalize depth map into [0, 1]
+ if depth_min is None:
+ depth_min = depth_map.min()
+ if depth_max is None:
+ depth_max = depth_map.max()
+ depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+ # depth_map = np.clip(depth_map / 3., 0., 1.)
+ if len(depth_map.shape) == 3:
+ assert depth_map.shape[-1] == 1
+ depth_map = depth_map[..., 0]
+ assert len(depth_map.shape) == 2 # [H, W]
+ return (255. * cm.hot(depth_map, 3)).astype(np.uint8)[..., :3]
+
+
+def playback_trajectory_with_env(
+ env,
+ initial_state,
+ states,
+ actions=None,
+ render=False,
+ video_writer=None,
+ video_skip=5,
+ camera_names=None,
+ first=False,
+ interventions=None,
+ real=False,
+):
+ """
+ Helper function to playback a single trajectory using the simulator environment.
+ If @actions are not None, it will play them open-loop after loading the initial state.
+ Otherwise, @states are loaded one by one.
+
+ Args:
+ env (instance of EnvBase): environment
+ initial_state (dict): initial simulation state to load
+ states (list of dict or np.array): array of simulation states to load
+ actions (np.array): if provided, play actions back open-loop instead of using @states
+ render (bool): if True, render on-screen
+ video_writer (imageio writer): video writer
+ video_skip (int): determines rate at which environment frames are written to video
+ camera_names (list): determines which camera(s) are used for rendering. Pass more than
+ one to output a video with multiple camera views concatenated horizontally.
+ first (bool): if True, only use the first frame of each episode.
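+        interventions (np.array): if provided, mark intervention timesteps with a red border in rendered video frames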
+ real (bool): if True, playback is happening on real robot
+ """
+ assert isinstance(env, EnvBase)
+
+ write_video = (video_writer is not None)
+ video_count = 0
+ assert not (render and write_video)
+
+ # load the initial state
+ env.reset()
+ if real:
+ assert actions is not None, "must supply actions for real robot playback"
+ traj_len = actions.shape[0]
+ input("ready for next episode? hit enter to continue")
+ else:
+ env.reset_to(initial_state)
+ traj_len = len(states)
+
+ action_playback = (actions is not None)
+ if action_playback:
+ assert len(states) == actions.shape[0]
+
+ for i in range(traj_len):
+ if action_playback:
+ env.step(actions[i])
+ if (i < traj_len - 1) and not real:
+ # check whether the actions deterministically lead to the same recorded states
+ state_playback = env.get_state()["states"]
+ if isinstance(state_playback, dict):
+ # state is dict, so assert equality for all keys
+ for k in state_playback:
+ if not np.all(np.equal(states[i + 1][k], state_playback[k])):
+ err = np.linalg.norm(states[i + 1][k] - state_playback[k])
+ print("warning: playback diverged by {} at step {} state key {}".format(err, i, k))
+ else:
+ if not np.all(np.equal(states[i + 1], state_playback)):
+ err = np.linalg.norm(states[i + 1] - state_playback)
+ print("warning: playback diverged by {} at step {}".format(err, i))
+
+ else:
+ env.reset_to({"states" : states[i]})
+
+ # on-screen render
+ if render:
+ env.render(mode="human", camera_name=camera_names[0])
+
+ # video render
+ if write_video:
+ if video_count % video_skip == 0:
+ video_img = []
+ for cam_name in camera_names:
+ frame = env.render(mode="rgb_array", height=512, width=512, camera_name=cam_name)
+ if (interventions is not None) and interventions[i]:
+ # add red border to frame
+ frame = add_red_border(frame=frame)
+ video_img.append(frame)
+ video_img = np.concatenate(video_img, axis=1) # concatenate horizontally
+ video_writer.append_data(video_img)
+ video_count += 1
+
+ if first:
+ break
+
+
+def playback_trajectory_with_obs(
+ traj_grp,
+ video_writer,
+ video_skip=5,
+ image_names=None,
+ depth_names=None,
+ first=False,
+ intervention=False,
+):
+ """
+ This function reads all "rgb" (and possibly "depth") observations in the dataset trajectory and
+ writes them into a video.
+
+ Args:
+ traj_grp (hdf5 file group): hdf5 group which corresponds to the dataset trajectory to playback
+ video_writer (imageio writer): video writer
+ video_skip (int): determines rate at which environment frames are written to video
+ image_names (list): determines which image observations are used for rendering. Pass more than
+ one to output a video with multiple image observations concatenated horizontally.
+ depth_names (list): determines which depth observations are used for rendering (if any).
+ first (bool): if True, only use the first frame of each episode.
+ intervention (bool): if True, denote intervention timesteps with a red border
+ """
+ assert image_names is not None, "error: must specify at least one image observation to use in @image_names"
+ video_count = 0
+
+ if depth_names is not None:
+ # compute min and max depth value across trajectory for normalization
+ depth_min = { k : traj_grp["obs/{}".format(k)][:].min() for k in depth_names }
+ depth_max = { k : traj_grp["obs/{}".format(k)][:].max() for k in depth_names }
+
+ traj_len = traj_grp["actions"].shape[0]
+ frame_inds = range(traj_len)
+ if first:
+ video_skip = 1 # keep all frames
+ if intervention:
+ # find where interventions begin (0 to 1 edge) and get frames right before them
+ if len(traj_grp["interventions"].shape) == 2:
+ all_interventions = traj_grp["interventions"][:, 0].astype(int)
+ else:
+ all_interventions = traj_grp["interventions"][:].astype(int)
+ frame_inds = list(np.nonzero((all_interventions[1:] - all_interventions[:-1]) > 0)[0])
+ else:
+ frame_inds = range(1)
+
+ for i in frame_inds:
+ if video_count % video_skip == 0:
+ # concatenate image obs together
+ im = [traj_grp["obs/{}".format(k)][i] for k in image_names]
+ depth = [depth_to_rgb(traj_grp["obs/{}".format(k)][i], depth_min=depth_min[k], depth_max=depth_max[k]) for k in depth_names] if depth_names is not None else []
+ frame = np.concatenate(im + depth, axis=1)
+ video_writer.append_data(frame)
+ video_count += 1
+
+
+def playback_dataset(args, env=None):
+ # some arg checking
+ write_video = (args.video_path is not None)
+ assert not (args.render and write_video) # either on-screen or video but not both
+ if args.absolute:
+ assert args.use_actions
+
+ # Auto-fill camera rendering info if not specified
+ if args.render_image_names is None:
+ # We fill in the automatic values
+ env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path=args.dataset)
+ env_type = EnvUtils.get_env_type(env_meta=env_meta)
+ args.render_image_names = DEFAULT_CAMERAS[env_type]
+
+ if args.render:
+ # on-screen rendering can only support one camera
+ assert len(args.render_image_names) == 1
+
+ if args.use_obs:
+ assert write_video, "playback with observations can only write to video"
+ assert not args.use_actions, "playback with observations is offline and does not support action playback"
+
+ if args.render_depth_names is not None:
+ assert args.use_obs, "depth observations can only be visualized from observations currently"
+
+ # create environment only if not playing back with observations
+ if not args.use_obs:
+ # need to make sure ObsUtils knows which observations are images, but it doesn't matter
+ # for playback since observations are unused. Pass a dummy spec here.
+ dummy_spec = dict(
+ obs=dict(
+ low_dim=["robot0_eef_pos"],
+ rgb=[],
+ ),
+ )
+
+ # some operations for playback are env-type-specific
+ env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path=args.dataset)
+ is_robosuite_env = EnvUtils.is_robosuite_env(env_meta)
+ is_real_robot = EnvUtils.is_real_robot_env(env_meta) or EnvUtils.is_real_robot_gprs_env(env_meta)
+
+ if args.absolute:
+ # modify env-meta to tell the environment to expect absolute actions
+ assert is_robosuite_env or is_real_robot, "only these support absolute actions for now"
+ if is_robosuite_env:
+ env_meta["env_kwargs"]["controller_configs"]["control_delta"] = False
+ else:
+ env_meta["env_kwargs"]["absolute_actions"] = True
+
+ if env is None:
+ if is_real_robot:
+ # TODO: update hardcoded keys on real robot
+ dummy_spec["obs"]["rgb"] = ["front_image", "wrist_image", "side_image"]
+ dummy_spec["obs"]["depth"] = ["front_image_depth", "wrist_image_depth", "side_image_depth"]
+ ObsUtils.initialize_obs_utils_with_obs_specs(obs_modality_specs=dummy_spec)
+ env = EnvUtils.create_env_from_metadata(env_meta=env_meta, render=args.render, render_offscreen=write_video)
+
+ f = h5py.File(args.dataset, "r")
+
+ # list of all demonstration episodes (sorted in increasing number order)
+ if args.filter_key is not None:
+ print("using filter key: {}".format(args.filter_key))
+ demos = [elem.decode("utf-8") for elem in np.array(f["mask/{}".format(args.filter_key)])]
+ else:
+ demos = list(f["data"].keys())
+ inds = np.argsort([int(elem[5:]) for elem in demos])
+ demos = [demos[i] for i in inds]
+
+ # maybe reduce the number of demonstrations to playback
+ if args.n is not None:
+ demos = demos[:args.n]
+
+ # maybe dump video
+ video_writer = None
+ if write_video:
+ fps = 5 if args.first else 20
+ video_writer = imageio.get_writer(args.video_path, fps=fps)
+
+ for ind in range(len(demos)):
+ ep = demos[ind]
+ print("Playing back episode: {}".format(ep))
+
+ if args.use_obs:
+ playback_trajectory_with_obs(
+ traj_grp=f["data/{}".format(ep)],
+ video_writer=video_writer,
+ video_skip=args.video_skip,
+ image_names=args.render_image_names,
+ depth_names=args.render_depth_names,
+ first=args.first,
+ intervention=args.intervention,
+ )
+ continue
+
+ # prepare states to reload from
+ if not is_real_robot:
+ states = f["data/{}/states".format(ep)][()]
+ initial_state = dict(states=states[0])
+ if is_robosuite_env:
+ initial_state["model"] = f["data/{}".format(ep)].attrs["model_file"]
+
+ # supply actions if using open-loop action playback
+ actions = None
+ if args.use_actions:
+ if args.absolute:
+ actions = f["data/{}/actions_abs".format(ep)][()]
+ else:
+ actions = f["data/{}/actions".format(ep)][()]
+
+ if is_real_robot:
+ assert actions is not None
+ states = np.zeros(actions.shape[0])
+ initial_state = dict(states=states[0])
+
+ # supply interventions if we need them for visualization
+ interventions = None
+ if args.intervention:
+ interventions = f["data/{}/interventions".format(ep)][()]
+
+ playback_trajectory_with_env(
+ env=env,
+ initial_state=initial_state,
+ states=states, actions=actions,
+ render=args.render,
+ video_writer=video_writer,
+ video_skip=args.video_skip,
+ camera_names=args.render_image_names,
+ first=args.first,
+ interventions=interventions,
+ real=is_real_robot,
+ )
+
+ f.close()
+ if write_video:
+ video_writer.close()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ help="path to hdf5 dataset",
+ )
+ parser.add_argument(
+ "--filter_key",
+ type=str,
+ default=None,
+ help="(optional) filter key, to select a subset of trajectories in the file",
+ )
+
+ # number of trajectories to playback. If omitted, playback all of them.
+ parser.add_argument(
+ "--n",
+ type=int,
+ default=None,
+ help="(optional) stop after n trajectories are played",
+ )
+
+ # Use image observations instead of doing playback using the simulator env.
+ parser.add_argument(
+ "--use-obs",
+ action='store_true',
+ help="visualize trajectories with dataset image observations instead of simulator",
+ )
+
+ # Playback stored dataset actions open-loop instead of loading from simulation states.
+ parser.add_argument(
+ "--use-actions",
+ action='store_true',
+ help="use open-loop action playback instead of loading sim states",
+ )
+
+ # TODO: clean up this arg
+ parser.add_argument(
+ "--absolute",
+ action='store_true',
+ help="use absolute actions for open-loop action playback",
+ )
+
+ # Whether to render playback to screen
+ parser.add_argument(
+ "--render",
+ action='store_true',
+ help="on-screen rendering",
+ )
+
+ # Dump a video of the dataset playback to the specified path
+ parser.add_argument(
+ "--video_path",
+ type=str,
+ default=None,
+ help="(optional) render trajectories to this video file path",
+ )
+
+ # How often to write video frames during the playback
+ parser.add_argument(
+ "--video_skip",
+ type=int,
+ default=5,
+ help="render frames to video every n steps",
+ )
+
+ # camera names to render, or image observations to use for writing to video
+ parser.add_argument(
+ "--render_image_names",
+ type=str,
+ nargs='+',
+ default=None,
+ help="(optional) camera name(s) / image observation(s) to use for rendering on-screen or to video. Default is"
+ "None, which corresponds to a predefined camera for each env type",
+ )
+
+ # depth observations to use for writing to video
+ parser.add_argument(
+ "--render_depth_names",
+ type=str,
+ nargs='+',
+ default=None,
+ help="(optional) depth observation(s) to use for rendering to video"
+ )
+
+ # Only use the first frame of each episode
+ parser.add_argument(
+ "--first",
+ action='store_true',
+ help="use first frame of each episode",
+ )
+
+ # Denote intervention timesteps with a red border in the frame
+ parser.add_argument(
+ "--intervention",
+ action='store_true',
+ help="denote intervention timesteps with a red border in the frame",
+ )
+
+ args = parser.parse_args()
+ playback_dataset(args)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/postprocess_dataset_intervention_segments.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/postprocess_dataset_intervention_segments.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9fec99aba334a9ba8298c3e73b6b391e29e921
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/postprocess_dataset_intervention_segments.py
@@ -0,0 +1,220 @@
+"""
+Script to postprocess a dataset by splitting each trajectory up into new trajectories
+that consist only of continuous intervention segments.
+"""
+import os
+import json
+import h5py
+import argparse
+import numpy as np
+
+import robomimic.utils.file_utils as FileUtils
+
+
+def write_intervention_segments_as_trajectories(
+ src_ep_grp,
+ dst_grp,
+ start_ep_write_ind,
+ same=False,
+):
+ """
+ Helper function to extract intervention segments from a source demonstration and use their indices to
+ write the corresponding subset of each trajectory to a new trajectory.
+
+ Returns:
+ end_ep_write_ind (int): updated episode index after writing trajectories to destination file
+ num_traj (int): number of trajectories written to destination file
+ total_samples (int): total number of samples written to destination file
+ same (bool): if True, write all intevrention segments to the same trajectory
+ """
+
+ # get segments
+ interventions = src_ep_grp["interventions"][()].reshape(-1).astype(int)
+ segments = FileUtils.get_intervention_segments(interventions)
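+    # each segment is a (start_ind, end_ind) index pair marking one contiguous run of intervention timesteps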
+
+ ep_write_ind = start_ep_write_ind
+ total_samples = 0
+ num_traj = len(segments)
+ keys_to_try_and_copy = ["states", "obs", "next_obs", "rewards", "dones", "actions_abs", "datagen_info"]
+
+ if same:
+ # concatenate information across intervention segments and write to single episode
+ num_traj = 1
+ dst_grp_name = "demo_{}".format(ep_write_ind)
+ dst_ep_grp = dst_grp.create_group(dst_grp_name)
+ for k in keys_to_try_and_copy:
+ should_compress = (k in ["obs", "next_obs"])
+ if k in src_ep_grp:
+ if isinstance(src_ep_grp[k], h5py.Group):
+ for k2 in src_ep_grp[k]:
+ assert isinstance(src_ep_grp[k][k2], h5py.Dataset)
+ data = np.concatenate(
+ [src_ep_grp[k][k2][seg_start_ind : seg_end_ind] for seg_start_ind, seg_end_ind in segments],
+ axis=0,
+ )
+ if should_compress:
+ dst_ep_grp.create_dataset("{}/{}".format(k, k2), data=data, compression="gzip")
+ else:
+ dst_ep_grp.create_dataset("{}/{}".format(k, k2), data=data)
+ else:
+ assert isinstance(src_ep_grp[k], h5py.Dataset)
+ data = np.concatenate(
+ [src_ep_grp[k][seg_start_ind : seg_end_ind] for seg_start_ind, seg_end_ind in segments],
+ axis=0,
+ )
+ if should_compress:
+ dst_ep_grp.create_dataset("{}".format(k), data=data, compression="gzip")
+ else:
+ dst_ep_grp.create_dataset("{}".format(k), data=data)
+
+ # manually copy actions since they might need truncation
+ actions = np.concatenate([src_ep_grp["actions"][seg_start_ind : seg_end_ind] for seg_start_ind, seg_end_ind in segments], axis=0)
+ if actions.shape[-1] != 7:
+ actions = actions[..., :7]
+ dst_ep_grp.create_dataset("actions", data=actions)
+
+ # mimicgen metadata
+ if "src_demo_inds" in src_ep_grp:
+ dst_ep_grp.create_dataset("src_demo_inds", data=np.array(src_ep_grp["src_demo_inds"][:]))
+ if "src_demo_labels" in src_ep_grp:
+ dst_ep_grp.create_dataset("src_demo_labels", data=np.array(src_ep_grp["src_demo_labels"][:]))
+
+ # copy attributes too
+ for k in src_ep_grp.attrs:
+ dst_ep_grp.attrs[k] = src_ep_grp.attrs[k]
+ dst_ep_grp.attrs["num_samples"] = np.sum([(seg_end_ind - seg_start_ind) for seg_start_ind, seg_end_ind in segments])
+
+ # update variables for next iter
+ ep_write_ind += 1
+ total_samples += dst_ep_grp.attrs["num_samples"]
+ print(" wrote trajectory to dst grp {} with num samples {}".format(dst_grp_name, dst_ep_grp.attrs["num_samples"]))
+ else:
+ # write each segment to new episode
+ for seg_start_ind, seg_end_ind in segments:
+ dst_grp_name = "demo_{}".format(ep_write_ind)
+ dst_ep_grp = dst_grp.create_group(dst_grp_name)
+
+ # copy over subsequence from source trajectory into destination trajectory
+ for k in keys_to_try_and_copy:
+ should_compress = (k in ["obs", "next_obs"])
+ if k in src_ep_grp:
+ if isinstance(src_ep_grp[k], h5py.Group):
+ for k2 in src_ep_grp[k]:
+ assert isinstance(src_ep_grp[k][k2], h5py.Dataset)
+ if should_compress:
+ dst_ep_grp.create_dataset("{}/{}".format(k, k2), data=np.array(src_ep_grp[k][k2][seg_start_ind : seg_end_ind]), compression="gzip")
+ else:
+ dst_ep_grp.create_dataset("{}/{}".format(k, k2), data=np.array(src_ep_grp[k][k2][seg_start_ind : seg_end_ind]))
+ else:
+ assert isinstance(src_ep_grp[k], h5py.Dataset)
+ if should_compress:
+ dst_ep_grp.create_dataset("{}".format(k), data=np.array(src_ep_grp[k][seg_start_ind : seg_end_ind]), compression="gzip")
+ else:
+ dst_ep_grp.create_dataset("{}".format(k), data=np.array(src_ep_grp[k][seg_start_ind : seg_end_ind]))
+
+ # manually copy actions since they might need truncation
+ actions = np.array(src_ep_grp["actions"][seg_start_ind : seg_end_ind])
+ if actions.shape[-1] != 7:
+ actions = actions[..., :7]
+ dst_ep_grp.create_dataset("actions", data=actions)
+
+ # mimicgen metadata
+ if "src_demo_inds" in src_ep_grp:
+ dst_ep_grp.create_dataset("src_demo_inds", data=np.array(src_ep_grp["src_demo_inds"][:]))
+ if "src_demo_labels" in src_ep_grp:
+ dst_ep_grp.create_dataset("src_demo_labels", data=np.array(src_ep_grp["src_demo_labels"][:]))
+
+ # copy attributes too
+ for k in src_ep_grp.attrs:
+ dst_ep_grp.attrs[k] = src_ep_grp.attrs[k]
+ dst_ep_grp.attrs["num_samples"] = (seg_end_ind - seg_start_ind)
+
+ # update variables for next iter
+ ep_write_ind += 1
+ total_samples += dst_ep_grp.attrs["num_samples"]
+ print(" wrote trajectory to dst grp {} with num samples {}".format(dst_grp_name, dst_ep_grp.attrs["num_samples"]))
+
+ return ep_write_ind, num_traj, total_samples
+
+
+def postprocess_dataset_intervention_segments(args):
+ # list of all demonstration episodes (sorted in increasing number order)
+ f = h5py.File(args.dataset, "r")
+ demos = list(f["data"].keys())
+ inds = np.argsort([int(elem[5:]) for elem in demos])
+ demos = [demos[i] for i in inds]
+
+ # maybe reduce the number of demonstrations to playback
+ if args.n is not None:
+ demos = demos[:args.n]
+
+ # output file in same directory as input file
+ output_path = os.path.join(os.path.dirname(args.dataset), args.output_name)
+ f_out = h5py.File(output_path, "w")
+ data_grp = f_out.create_group("data")
+ print("\ninput file: {}".format(args.dataset))
+ print("output file: {}\n".format(output_path))
+
+ ep_write_ind = 0
+ num_traj = 0
+ total_samples = 0
+ for ind in range(len(demos)):
+ ep = demos[ind]
+ src_ep_grp = f["data/{}".format(ep)]
+ print("src grp: {} with {} samples".format(ep, src_ep_grp.attrs["num_samples"]))
+ ep_write_ind, ep_num_traj, ep_total_samples = write_intervention_segments_as_trajectories(
+ src_ep_grp=src_ep_grp,
+ dst_grp=data_grp,
+ start_ep_write_ind=ep_write_ind,
+ same=args.same,
+ )
+ num_traj += ep_num_traj
+ total_samples += ep_total_samples
+
+ # TODO: update filter keys based on which source demos created which target demos
+ # if "mask" in f:
+ # f.copy("mask", f_out)
+
+ # global metadata
+ data_grp.attrs["total"] = total_samples
+ data_grp.attrs["env_args"] = f["data"].attrs["env_args"] # environment info
+ print("\nWrote {} trajectories from src with {} trajectories to {}".format(num_traj, len(demos), output_path))
+
+ f.close()
+ f_out.close()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ required=True,
+ help="path to input hdf5 dataset",
+ )
+ # name of hdf5 to write - it will be in the same directory as @dataset
+ parser.add_argument(
+ "--output_name",
+ type=str,
+ required=True,
+ help="name of output hdf5 dataset",
+ )
+
+ # specify number of demos to process - useful for debugging conversion with a handful
+ # of trajectories
+ parser.add_argument(
+ "--n",
+ type=int,
+ default=None,
+ help="(optional) stop after n trajectories are processed",
+ )
+
+ # write all intervention segments to the same demo key (so they will be treated as a contiguous trajectory in time)
+ parser.add_argument(
+ "--same",
+ action='store_true',
+ help="write all intervention segments to the same demo key (so they will be treated as a contiguous trajectory in time",
+ )
+
+ args = parser.parse_args()
+ postprocess_dataset_intervention_segments(args)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/remove_idle_segments.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/remove_idle_segments.py
new file mode 100644
index 0000000000000000000000000000000000000000..35d639363b384a69c87ab93d9beb80e916cd6f2f
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/remove_idle_segments.py
@@ -0,0 +1,199 @@
+"""
+Script to remove idle segments from a real robot hdf5.
+"""
+import os
+import h5py
+import argparse
+import numpy as np
+from tqdm import tqdm
+
+import robomimic.utils.file_utils as FileUtils
+from robomimic.scripts.postprocess_dataset_intervention_segments import postprocess_dataset_intervention_segments
+
+
+def get_idle_segments_in_trajectory(
+ ep_grp,
+ obs_pos_key="ee_pose",
+ min_segment_length=1,
+ threshold=1e-4,
+ verbose=False,
+):
+ """
+ Returns a mask that corresponds to idle segments in the trajectory.
+
+ Args:
+ ep_grp (h5py.Group): hdf5 group that corresponds to a demo key (such as "demo_0")
+ obs_pos_key (str): key for eef pos observations
+ min_segment_length (int): minimum length of idle segment
+ threshold (float): threshold for delta eef pos differences - everything below this threshold
+ value is considered idle
+ verbose (bool): if True, print some helpful info
+
+ Returns:
+ idle_segment_mask (np.array): array with value of 1 during an idle segment
+ """
+ if verbose:
+ print(ep_grp)
+ eef_pos = ep_grp["obs/{}".format(obs_pos_key)][:, :3]
+ delta_eef_pos_norms = np.linalg.norm(np.diff(eef_pos, axis=0), axis=1)
+
+ # note: pad with 0 at start to make sure indices correspond to indices in @eef_pos (otherwise we're off by one due to the difference calculation)
+ idle_segment_mask = np.array([0] + (delta_eef_pos_norms < threshold).astype(int).tolist())
+ idle_segments = FileUtils.get_intervention_segments(idle_segment_mask)
+
+ # filter out segments that are too short
+ ret_mask = np.zeros(eef_pos.shape[0]).astype(int)
+ for seg in idle_segments:
+ if seg[1] - seg[0] >= min_segment_length:
+ ret_mask[seg[0] : seg[1]] = 1
+
+ if verbose:
+ print("segment: {}".format(seg))
+ # print norms N timesteps before and after window to get a sense of nearby values
+ prev_norms = delta_eef_pos_norms[max(seg[0] - 6, 0) : seg[0] - 1]
+ print("prev_norms")
+ print(prev_norms)
+ post_norms = delta_eef_pos_norms[seg[1] - 1 : min(seg[1] + 4, eef_pos.shape[0] - 1)]
+ print("post_norms")
+ print(post_norms)
+
+ return ret_mask
+
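+# Minimal sketch of the thresholding idea used in get_idle_segments_in_trajectory (synthetic values, not from any dataset):
+#
+#   eef_pos = np.array([[0.00, 0, 0], [0.00, 0, 0], [0.01, 0, 0], [0.02, 0, 0]])
+#   norms = np.linalg.norm(np.diff(eef_pos, axis=0), axis=1)       # -> [0.0, 0.01, 0.01]
+#   mask = np.array([0] + (norms < 1e-4).astype(int).tolist())     # -> [0, 1, 0, 0]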
+
+def write_non_idle_segments_as_interventions(hdf5_path, n=None, min_segment_length=1, threshold=1e-4):
+ """
+ Modifies the hdf5 in-place by splitting each trajectory into idle and non-idle segments, and
+ writing the result as an "interventions" key in each trajectory, where the interventions correspond
+ to non-idle segments.
+ """
+
+ # get demo keys
+ f = h5py.File(args.dataset, "a")
+ demos = list(f["data"].keys())
+ inds = np.argsort([int(elem[5:]) for elem in demos])
+ demos = [demos[i] for i in inds]
+    if n is not None:
+        demos = demos[:n]
+
+ # for each demo key, get idle segment, and write to interventions
+ for demo_key in tqdm(demos):
+ ep_grp = f["data/{}".format(demo_key)]
+ idle_seg_mask = get_idle_segments_in_trajectory(
+ ep_grp=ep_grp,
+ obs_pos_key="ee_pose",
+ min_segment_length=min_segment_length,
+ threshold=threshold,
+ )
+
+ # write non-idle segment mask as interventions
+ non_idle_seg_mask = 1 - idle_seg_mask
+ if "interventions" in ep_grp:
+ del ep_grp["interventions"]
+ ep_grp.create_dataset("interventions", data=non_idle_seg_mask)
+
+ f.close()
+
+
+def combine_intervention_segments(hdf5_path, output_name, n=None):
+ """
+ Helper function to combine intervention segments in each demo trajectory together, and discard
+ non-intervention segments. This repurposes the postprocess_dataset_intervention_segments.py to
+ essentially remove the idle segments (which are non-intervention segments).
+ """
+ args = argparse.Namespace()
+ args.dataset = os.path.expandvars(os.path.expanduser(hdf5_path))
+ args.output_name = output_name
+ args.n = n
+ args.same = True
+ postprocess_dataset_intervention_segments(args)
+
+
+def remove_idle_segments(args):
+ if args.debug:
+ # print idle segments for the demos
+
+ # get demo keys
+ f = h5py.File(args.dataset, "r")
+ demos = list(f["data"].keys())
+ inds = np.argsort([int(elem[5:]) for elem in demos])
+ demos = [demos[i] for i in inds]
+ if args.n is not None:
+ demos = demos[:args.n]
+
+ for demo_key in demos:
+ idle_seg_mask = get_idle_segments_in_trajectory(
+ ep_grp=f["data/{}".format(demo_key)],
+ obs_pos_key="ee_pose",
+ # min_segment_length=1,
+ min_segment_length=7,
+ threshold=1e-4,
+ # threshold=3e-4,
+ # verbose=True,
+ verbose=False,
+ )
+ idle_segs = FileUtils.get_intervention_segments(idle_seg_mask)
+ print(demo_key)
+ # print(len(idle_segs))
+ print("idle segments")
+ print(idle_segs)
+ print("segment lengths")
+ print([seg[1] - seg[0] for seg in idle_segs])
+
+ f.close()
+ exit()
+
+ assert args.output_name is not None
+
+ # split each trajectory into idle and non-idle segments and write to "interventions" key
+ print("writing non-idle segments as interventions...")
+ write_non_idle_segments_as_interventions(
+ hdf5_path=args.dataset,
+ n=args.n,
+ # some good candidates below
+ min_segment_length=7,
+ threshold=1e-4,
+ # min_segment_length=7,
+ # threshold=3e-4,
+ )
+
+ # write new dataset, keeping only interventions
+ print("combining interventions into new dataset...")
+ combine_intervention_segments(
+ hdf5_path=args.dataset,
+ output_name=args.output_name,
+ n=args.n,
+ )
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ required=True,
+ help="path to input hdf5 dataset",
+ )
+ # name of hdf5 to write - it will be in the same directory as @dataset
+ parser.add_argument(
+ "--output_name",
+ type=str,
+ default=None,
+ help="name of output hdf5 dataset",
+ )
+
+ # specify number of demos to process - useful for debugging conversion with a handful
+ # of trajectories
+ parser.add_argument(
+ "--n",
+ type=int,
+ default=None,
+ help="(optional) stop after n trajectories are processed",
+ )
+
+ parser.add_argument(
+ "--debug",
+ action='store_true',
+ help="just print the idle and non-idle segment splits instead of actually doing any dataset processing",
+ )
+
+ args = parser.parse_args()
+ remove_idle_segments(args)
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/run_trained_agent.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/run_trained_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7ef99cdc7742e0acbc4c7c9dabe51b7e8198187
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/run_trained_agent.py
@@ -0,0 +1,536 @@
+"""
+The main script for evaluating a policy in an environment.
+
+Args:
+ agent (str): path to saved checkpoint pth file
+
+ horizon (int): if provided, override maximum horizon of rollout from the one
+ in the checkpoint
+
+ env (str): if provided, override name of env from the one in the checkpoint,
+ and use it for rollouts
+
+ render (bool): if flag is provided, use on-screen rendering during rollouts
+
+ video_path (str): if provided, render trajectories to this video file path
+
+ video_skip (int): render frames to a video every @video_skip steps
+
+ camera_names (str or [str]): camera name(s) to use for rendering on-screen or to video
+
+ dataset_path (str): if provided, an hdf5 file will be written at this path with the
+ rollout data
+
+ dataset_obs (bool): if flag is provided, and @dataset_path is provided, include
+ possible high-dimensional observations in output dataset hdf5 file (by default,
+ observations are excluded and only simulator states are saved).
+
+ seed (int): if provided, set seed for rollouts
+
+Example usage:
+
+ # Evaluate a policy with 50 rollouts of maximum horizon 400 and save the rollouts to a video.
+ # Visualize the agentview and wrist cameras during the rollout.
+
+ python run_trained_agent.py --agent /path/to/model.pth \
+ --n_rollouts 50 --horizon 400 --seed 0 \
+ --video_path /path/to/output.mp4 \
+ --camera_names agentview robot0_eye_in_hand
+
+ # Write the 50 agent rollouts to a new dataset hdf5.
+
+ python run_trained_agent.py --agent /path/to/model.pth \
+ --n_rollouts 50 --horizon 400 --seed 0 \
+ --dataset_path /path/to/output.hdf5 --dataset_obs
+
+ # Write the 50 agent rollouts to a new dataset hdf5, but exclude the dataset observations
+ # since they might be high-dimensional (they can be extracted again using the
+ # dataset_states_to_obs.py script).
+
+ python run_trained_agent.py --agent /path/to/model.pth \
+ --n_rollouts 50 --horizon 400 --seed 0 \
+ --dataset_path /path/to/output.hdf5
+"""
+import argparse
+import os
+import json
+import h5py
+import imageio
+import sys
+import time
+import traceback
+import numpy as np
+from copy import deepcopy
+from tqdm import tqdm
+
+import torch
+
+import robomimic
+import robomimic.utils.file_utils as FileUtils
+import robomimic.utils.env_utils as EnvUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.obs_utils as ObsUtils
+from robomimic.utils.log_utils import log_warning
+from robomimic.envs.env_base import EnvBase
+from robomimic.envs.wrappers import EnvWrapper
+from robomimic.algo import RolloutPolicy
+from robomimic.scripts.playback_dataset import DEFAULT_CAMERAS
+
+
+def rollout(policy, env, horizon, render=False, video_writer=None, video_skip=5, return_obs=False, camera_names=None, real=False, rate_measure=None):
+ """
+ Helper function to carry out rollouts. Supports on-screen rendering, off-screen rendering to a video,
+ and returns the rollout trajectory.
+
+ Args:
+ policy (instance of RolloutPolicy): policy loaded from a checkpoint
+ env (instance of EnvBase): env loaded from a checkpoint or demonstration metadata
+ horizon (int): maximum horizon for the rollout
+ render (bool): whether to render rollout on-screen
+ video_writer (imageio writer): if provided, use to write rollout to video
+ video_skip (int): how often to write video frames
+        return_obs (bool): if True, return possibly high-dimensional observations along the trajectory.
+ They are excluded by default because the low-dimensional simulation states should be a minimal
+ representation of the environment.
+ camera_names (list): determines which camera(s) are used for rendering. Pass more than
+ one to output a video with multiple camera views concatenated horizontally.
+ real (bool): if real robot rollout
+ rate_measure: if provided, measure rate of action computation and do not play actions in environment
+
+ Returns:
+ stats (dict): some statistics for the rollout - such as return, horizon, and task success
+ traj (dict): dictionary that corresponds to the rollout trajectory
+ """
+ rollout_timestamp = time.time()
+ assert isinstance(env, EnvBase) or isinstance(env, EnvWrapper)
+ assert isinstance(policy, RolloutPolicy)
+ assert not (render and (video_writer is not None))
+
+ policy.start_episode()
+ obs = env.reset()
+ state_dict = dict()
+ if real:
+ input("ready for next eval? hit enter to continue")
+ else:
+ state_dict = env.get_state()
+ # hack that is necessary for robosuite tasks for deterministic action playback
+ obs = env.reset_to(state_dict)
+
+ results = {}
+ video_count = 0 # video frame counter
+ total_reward = 0.
+ got_exception = False
+ success = env.is_success()["task"]
+ traj = dict(actions=[], rewards=[], dones=[], states=[], initial_state_dict=state_dict)
+ if return_obs:
+ # store observations too
+ traj.update(dict(obs=[], next_obs=[]))
+ try:
+ for step_i in range(horizon):
+ # HACK: some keys on real robot do not have a shape (and then they get frame stacked)
+ for k in obs:
+ if len(obs[k].shape) == 1:
+ obs[k] = obs[k][..., None]
+
+ # get action from policy
+ t1 = time.time()
+ act = policy(ob=obs)
+ t2 = time.time()
+ if real and (not env.base_env.controller_type == "JOINT_IMPEDANCE") and (policy.policy.global_config.algo_name != "diffusion_policy"):
+ # joint impedance actions and diffusion policy actions are absolute in the real world
+ act = np.clip(act, -1., 1.)
+
+ if rate_measure is not None:
+ rate_measure.measure()
+ print("time: {}s".format(t2 - t1))
+ # dummy reward and done
+ r = 0.
+ done = False
+ next_obs = obs
+ else:
+ # play action
+ next_obs, r, done, _ = env.step(act)
+
+ # compute reward
+ total_reward += r
+ success = env.is_success()["task"]
+
+ # visualization
+ if render:
+ env.render(mode="human", camera_name=camera_names[0])
+ if video_writer is not None:
+ if video_count % video_skip == 0:
+ video_img = []
+ for cam_name in camera_names:
+ video_img.append(env.render(mode="rgb_array", height=512, width=512, camera_name=cam_name))
+ video_img = np.concatenate(video_img, axis=1) # concatenate horizontally
+ video_writer.append_data(video_img)
+ video_count += 1
+
+ # collect transition
+ traj["actions"].append(act)
+ traj["rewards"].append(r)
+ traj["dones"].append(done)
+ if not real:
+ traj["states"].append(state_dict["states"])
+ if return_obs:
+ # Note: We need to "unprocess" the observations to prepare to write them to dataset.
+ # This includes operations like channel swapping and float to uint8 conversion
+ # for saving disk space.
+ traj["obs"].append(ObsUtils.unprocess_obs_dict(obs))
+ traj["next_obs"].append(ObsUtils.unprocess_obs_dict(next_obs))
+
+ # break if done or if success
+ if done or success:
+ break
+
+ # update for next iter
+ obs = deepcopy(next_obs)
+ if not real:
+ state_dict = env.get_state()
+
+ except env.rollout_exceptions as e:
+ print("WARNING: got rollout exception {}".format(e))
+ got_exception = True
+
+ stats = dict(
+ Return=total_reward,
+ Horizon=(step_i + 1),
+ Success_Rate=float(success),
+ Exception_Rate=float(got_exception),
+ time=(time.time() - rollout_timestamp),
+ )
+
+ if return_obs:
+ # convert list of dict to dict of list for obs dictionaries (for convenient writes to hdf5 dataset)
+ traj["obs"] = TensorUtils.list_of_flat_dict_to_dict_of_list(traj["obs"])
+ traj["next_obs"] = TensorUtils.list_of_flat_dict_to_dict_of_list(traj["next_obs"])
+
+ # list to numpy array
+ for k in traj:
+ if k == "initial_state_dict":
+ continue
+ if isinstance(traj[k], dict):
+ for kp in traj[k]:
+ traj[k][kp] = np.array(traj[k][kp])
+ else:
+ traj[k] = np.array(traj[k])
+
+ return stats, traj
+
+
+def run_trained_agent(args):
+ # some arg checking
+ write_video = (args.video_path is not None)
+ assert not (args.render and write_video) # either on-screen or video but not both
+
+ rate_measure = None
+ if args.hz is not None:
+ import RobotTeleop
+ from RobotTeleop.utils import Rate, RateMeasure, Timers
+ rate_measure = RateMeasure(name="control_rate_measure", freq_threshold=args.hz)
+
+ # load ckpt dict and get algo name for sanity checks
+ algo_name, ckpt_dict = FileUtils.algo_name_from_checkpoint(ckpt_path=args.agent)
+
+ if args.dp_eval_steps is not None:
+ assert algo_name == "diffusion_policy"
+ log_warning("setting @num_inference_steps to {}".format(args.dp_eval_steps))
+
+ # HACK: modify the config, then dump to json again and write to ckpt_dict
+ tmp_config, _ = FileUtils.config_from_checkpoint(ckpt_dict=ckpt_dict)
+ with tmp_config.values_unlocked():
+ if tmp_config.algo.ddpm.enabled:
+ tmp_config.algo.ddpm.num_inference_timesteps = args.dp_eval_steps
+ elif tmp_config.algo.ddim.enabled:
+ tmp_config.algo.ddim.num_inference_timesteps = args.dp_eval_steps
+ else:
+ raise Exception("should not reach here")
+ ckpt_dict['config'] = tmp_config.dump()
+
+ # device
+ device = TorchUtils.get_torch_device(try_to_use_cuda=True)
+
+ # restore policy
+ policy, ckpt_dict = FileUtils.policy_from_checkpoint(ckpt_dict=ckpt_dict, device=device, verbose=True)
+
+ # read rollout settings
+ rollout_num_episodes = args.n_rollouts
+ rollout_horizon = args.horizon
+ config, _ = FileUtils.config_from_checkpoint(ckpt_dict=ckpt_dict)
+ if rollout_horizon is None:
+ # read horizon from config
+ rollout_horizon = config.experiment.rollout.horizon
+
+ # HACK: assume absolute actions for now if using diffusion policy on real robot
+ if (algo_name == "diffusion_policy") and EnvUtils.is_real_robot_gprs_env(env_meta=ckpt_dict["env_metadata"]):
+ ckpt_dict["env_metadata"]["env_kwargs"]["absolute_actions"] = True
+
+ # create environment from saved checkpoint
+ env, _ = FileUtils.env_from_checkpoint(
+ ckpt_dict=ckpt_dict,
+ env_name=args.env,
+ render=args.render,
+ render_offscreen=(args.video_path is not None),
+ verbose=True,
+ )
+
+ # Auto-fill camera rendering info if not specified
+ if args.camera_names is None:
+ # We fill in the automatic values
+ env_type = EnvUtils.get_env_type(env=env)
+ args.camera_names = DEFAULT_CAMERAS[env_type]
+ if args.render:
+ # on-screen rendering can only support one camera
+ assert len(args.camera_names) == 1
+
+ is_real_robot = EnvUtils.is_real_robot_env(env=env) or EnvUtils.is_real_robot_gprs_env(env=env)
+ if is_real_robot:
+ # on real robot - log some warnings
+ need_pause = False
+ if "env_name" not in ckpt_dict["env_metadata"]["env_kwargs"]:
+ log_warning("env_name not in checkpoint...proceed with caution...")
+ need_pause = True
+ if ckpt_dict["env_metadata"]["env_name"] != "EnvRealPandaGPRS":
+ # we will load EnvRealPandaGPRS class by default on real robot even if agent was collected with different class
+ log_warning("env name in metadata appears to be class ({}) different from EnvRealPandaGPRS".format(ckpt_dict["env_metadata"]["env_name"]))
+ need_pause = True
+ if need_pause:
+ ans = input("continue? (y/n)")
+ if ans != "y":
+ exit()
+
+ # maybe set seed
+ if args.seed is not None:
+ np.random.seed(args.seed)
+ torch.manual_seed(args.seed)
+
+ # maybe create video writer
+ video_writer = None
+ if write_video:
+ video_writer = imageio.get_writer(args.video_path, fps=20)
+
+ # maybe open hdf5 to write rollouts
+ write_dataset = (args.dataset_path is not None)
+ if write_dataset:
+ data_writer = h5py.File(args.dataset_path, "w")
+ data_grp = data_writer.create_group("data")
+ total_samples = 0
+
+ rollout_stats = []
+ for i in tqdm(range(rollout_num_episodes)):
+ try:
+ stats, traj = rollout(
+ policy=policy,
+ env=env,
+ horizon=rollout_horizon,
+ render=args.render,
+ video_writer=video_writer,
+ video_skip=args.video_skip,
+ return_obs=(write_dataset and args.dataset_obs),
+ camera_names=args.camera_names,
+ real=is_real_robot,
+ rate_measure=rate_measure,
+ )
+ except KeyboardInterrupt:
+ if is_real_robot:
+ print("ctrl-C catched, stop execution")
+ print("env rate measure")
+ print(env.rate_measure)
+ ans = input("success? (y / n)")
+ rollout_stats.append((1 if ans == "y" else 0))
+ print("*" * 50)
+ print("have {} success out of {} attempts".format(np.sum(rollout_stats), len(rollout_stats)))
+ print("*" * 50)
+ continue
+ else:
+ sys.exit(0)
+
+ if is_real_robot:
+ print("TERMINATE WITHOUT KEYBOARD INTERRUPT...")
+ ans = input("success? (y / n)")
+ rollout_stats.append((1 if ans == "y" else 0))
+ continue
+ rollout_stats.append(stats)
+
+ if write_dataset:
+ # store transitions
+ ep_data_grp = data_grp.create_group("demo_{}".format(i))
+ ep_data_grp.create_dataset("actions", data=np.array(traj["actions"]))
+ ep_data_grp.create_dataset("states", data=np.array(traj["states"]))
+ ep_data_grp.create_dataset("rewards", data=np.array(traj["rewards"]))
+ ep_data_grp.create_dataset("dones", data=np.array(traj["dones"]))
+ if args.dataset_obs:
+ for k in traj["obs"]:
+ ep_data_grp.create_dataset("obs/{}".format(k), data=np.array(traj["obs"][k]))
+ ep_data_grp.create_dataset("next_obs/{}".format(k), data=np.array(traj["next_obs"][k]))
+
+ # episode metadata
+ if "model" in traj["initial_state_dict"]:
+ ep_data_grp.attrs["model_file"] = traj["initial_state_dict"]["model"] # model xml for this episode
+ ep_data_grp.attrs["num_samples"] = traj["actions"].shape[0] # number of transitions in this episode
+ total_samples += traj["actions"].shape[0]
+
+ rollout_stats = TensorUtils.list_of_flat_dict_to_dict_of_list(rollout_stats)
+ avg_rollout_stats = { k : np.mean(rollout_stats[k]) for k in rollout_stats }
+ avg_rollout_stats["Num_Success"] = np.sum(rollout_stats["Success_Rate"])
+ avg_rollout_stats["Time_Episode"] = np.sum(rollout_stats["time"]) / 60. # total time taken for rollouts in minutes
+ avg_rollout_stats["Num_Episode"] = len(rollout_stats["Success_Rate"]) # number of episodes attempted
+ print("Average Rollout Stats")
+ stats_json = json.dumps(avg_rollout_stats, indent=4)
+ print(stats_json)
+ if args.json_path is not None:
+ json_f = open(args.json_path, "w")
+ json_f.write(stats_json)
+ json_f.close()
+
+ if write_video:
+ video_writer.close()
+
+ if write_dataset:
+ # global metadata
+ data_grp.attrs["total"] = total_samples
+ data_grp.attrs["env_args"] = json.dumps(env.serialize(), indent=4) # environment info
+ data_writer.close()
+ print("Wrote dataset trajectories to {}".format(args.dataset_path))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # Path to trained model
+ parser.add_argument(
+ "--agent",
+ type=str,
+ required=True,
+ help="path to saved checkpoint pth file",
+ )
+
+ # number of rollouts
+ parser.add_argument(
+ "--n_rollouts",
+ type=int,
+ default=27,
+ help="number of rollouts",
+ )
+
+ # maximum horizon of rollout, to override the one stored in the model checkpoint
+ parser.add_argument(
+ "--horizon",
+ type=int,
+ default=None,
+ help="(optional) override maximum horizon of rollout from the one in the checkpoint",
+ )
+
+ # Env Name (to override the one stored in model checkpoint)
+ parser.add_argument(
+ "--env",
+ type=str,
+ default=None,
+ help="(optional) override name of env from the one in the checkpoint, and use\
+ it for rollouts",
+ )
+
+ # Whether to render rollouts to screen
+ parser.add_argument(
+ "--render",
+ action='store_true',
+ help="on-screen rendering",
+ )
+
+ # Dump a video of the rollouts to the specified path
+ parser.add_argument(
+ "--video_path",
+ type=str,
+ default=None,
+ help="(optional) render rollouts to this video file path",
+ )
+
+ # How often to write video frames during the rollout
+ parser.add_argument(
+ "--video_skip",
+ type=int,
+ default=5,
+ help="render frames to video every n steps",
+ )
+
+ # camera names to render
+ parser.add_argument(
+ "--camera_names",
+ type=str,
+ nargs='+',
+ default=None,
+ help="(optional) camera name(s) to use for rendering on-screen or to video",
+ )
+
+ # If provided, an hdf5 file will be written with the rollout data
+ parser.add_argument(
+ "--dataset_path",
+ type=str,
+ default=None,
+ help="(optional) if provided, an hdf5 file will be written at this path with the rollout data",
+ )
+
+ # If True and @dataset_path is supplied, will write possibly high-dimensional observations to dataset.
+ parser.add_argument(
+ "--dataset_obs",
+ action='store_true',
+ help="include possibly high-dimensional observations in output dataset hdf5 file (by default,\
+ observations are excluded and only simulator states are saved)",
+ )
+
+ # for seeding before starting rollouts
+ parser.add_argument(
+ "--seed",
+ type=int,
+ default=None,
+ help="(optional) set seed for rollouts",
+ )
+
+ # Dump a json of the rollout results stats to the specified path
+ parser.add_argument(
+ "--json_path",
+ type=str,
+ default=None,
+ help="(optional) dump a json of the rollout results stats to the specified path",
+ )
+
+ # Dump a file with the error traceback at this path. Only created if run fails with an error.
+ parser.add_argument(
+ "--error_path",
+ type=str,
+ default=None,
+ help="(optional) dump a file with the error traceback at this path. Only created if run fails with an error.",
+ )
+
+ # TODO: clean up this arg
+ # If provided, do not run actions in env, and instead just measure the rate of action computation
+ parser.add_argument(
+ "--hz",
+ type=int,
+ default=None,
+ help="If provided, do not run actions in env, and instead just measure the rate of action computation and raise warnings if it dips below this threshold",
+ )
+
+ # TODO: clean up this arg
+ # If provided, set num_inference_timesteps explicitly for diffusion policy evaluation
+ parser.add_argument(
+ "--dp_eval_steps",
+ type=int,
+ default=None,
+ help="If provided, set num_inference_timesteps explicitly for diffusion policy evaluation",
+ )
+
+ args = parser.parse_args()
+ res_str = None
+ try:
+ run_trained_agent(args)
+ except Exception as e:
+ res_str = "run failed with error:\n{}\n\n{}".format(e, traceback.format_exc())
+ if args.error_path is not None:
+ # write traceback to file
+ f = open(args.error_path, "w")
+ f.write(res_str)
+ f.close()
+ raise e
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/setup_macros.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/setup_macros.py
new file mode 100644
index 0000000000000000000000000000000000000000..92c472712078684a84e9ae624b13cd7d9b6c953c
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/setup_macros.py
@@ -0,0 +1,32 @@
+"""
+This script sets up a private macros file.
+
+The private macros file (macros_private.py) is not tracked by git,
+allowing user-specific settings to be kept out of version control.
+
+This script checks whether macros_private.py exists. If it does not (or if the user
+chooses to overwrite it), the file is created at robomimic/macros_private.py by copying macros.py.
+"""
+
+import os
+import robomimic
+import shutil
+
+if __name__ == "__main__":
+ base_path = robomimic.__path__[0]
+ macros_path = os.path.join(base_path, "macros.py")
+ macros_private_path = os.path.join(base_path, "macros_private.py")
+
+    if not os.path.exists(macros_path):
+        print("{} does not exist! Aborting...".format(macros_path))
+        exit(1)
+
+ if os.path.exists(macros_private_path):
+ ans = input("{} already exists! \noverwrite? (y/n)\n".format(macros_private_path))
+
+ if ans == "y":
+ print("REMOVING")
+ else:
+ exit()
+
+ shutil.copyfile(macros_path, macros_private_path)
+ print("copied {}\nto {}".format(macros_path, macros_private_path))
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/split_train_val.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/split_train_val.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d0502ea81dc238e21e0211c1c71c803f0b1b00d
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/split_train_val.py
@@ -0,0 +1,105 @@
+"""
+Script for splitting a dataset hdf5 file into training and validation trajectories.
+
+Args:
+ dataset (str): path to hdf5 dataset
+
+ filter_key (str): if provided, split the subset of trajectories
+ in the file that correspond to this filter key into a training
+ and validation set of trajectories, instead of splitting the
+ full set of trajectories
+
+ ratio (float): validation ratio, in (0, 1). Defaults to 0.1, which is 10%.
+
+Example usage:
+ python split_train_val.py --dataset /path/to/demo.hdf5 --ratio 0.1
+"""
+
+import argparse
+import h5py
+import numpy as np
+
+from robomimic.utils.file_utils import create_hdf5_filter_key
+
+
+def split_train_val_from_hdf5(hdf5_path, val_ratio=0.1, filter_key=None):
+ """
+ Splits data into training set and validation set from HDF5 file.
+
+ Args:
+ hdf5_path (str): path to the hdf5 file
+ to load the transitions from
+
+ val_ratio (float): ratio of validation demonstrations to all demonstrations
+
+ filter_key (str): if provided, split the subset of demonstration keys stored
+ under mask/@filter_key instead of the full set of demonstrations
+ """
+
+ # retrieve number of demos
+ f = h5py.File(hdf5_path, "r")
+ if filter_key is not None:
+ print("using filter key: {}".format(filter_key))
+ demos = sorted([elem.decode("utf-8") for elem in np.array(f["mask/{}".format(filter_key)])])
+ else:
+ demos = sorted(list(f["data"].keys()))
+ num_demos = len(demos)
+ f.close()
+
+ # get random split
+ num_demos = len(demos)
+ num_val = int(val_ratio * num_demos)
+ mask = np.zeros(num_demos)
+ mask[:num_val] = 1.
+ np.random.shuffle(mask)
+ mask = mask.astype(int)
+ train_inds = (1 - mask).nonzero()[0]
+ valid_inds = mask.nonzero()[0]
+ train_keys = [demos[i] for i in train_inds]
+ valid_keys = [demos[i] for i in valid_inds]
+ print("{} validation demonstrations out of {} total demonstrations.".format(num_val, num_demos))
+
+ # pass mask to generate split
+ name_1 = "train"
+ name_2 = "valid"
+ if filter_key is not None:
+ name_1 = "{}_{}".format(filter_key, name_1)
+ name_2 = "{}_{}".format(filter_key, name_2)
+
+ train_lengths = create_hdf5_filter_key(hdf5_path=hdf5_path, demo_keys=train_keys, key_name=name_1)
+ valid_lengths = create_hdf5_filter_key(hdf5_path=hdf5_path, demo_keys=valid_keys, key_name=name_2)
+
+ print("Total number of train samples: {}".format(np.sum(train_lengths)))
+ print("Average number of train samples {}".format(np.mean(train_lengths)))
+
+ print("Total number of valid samples: {}".format(np.sum(valid_lengths)))
+ print("Average number of valid samples {}".format(np.mean(valid_lengths)))
+
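+# Note (summary of the naming logic above): the split keys are "train" and "valid" by default, or
+# "<filter_key>_train" / "<filter_key>_valid" when --filter_key is provided; they are stored as hdf5 filter keys.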
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ help="path to hdf5 dataset",
+ )
+ parser.add_argument(
+ "--filter_key",
+ type=str,
+ default=None,
+ help="if provided, split the subset of trajectories in the file that correspond to\
+ this filter key into a training and validation set of trajectories, instead of\
+ splitting the full set of trajectories",
+ )
+ parser.add_argument(
+ "--ratio",
+ type=float,
+ default=0.1,
+ help="validation ratio, in (0, 1)"
+ )
+ args = parser.parse_args()
+
+ # seed to make sure results are consistent
+ np.random.seed(0)
+
+ split_train_val_from_hdf5(args.dataset, val_ratio=args.ratio, filter_key=args.filter_key)
diff --git a/phantom/submodules/phantom-robomimic/robomimic/scripts/train.py b/phantom/submodules/phantom-robomimic/robomimic/scripts/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b101984e2972395175e1b0c21563b9ab15cba2d
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/scripts/train.py
@@ -0,0 +1,599 @@
+"""
+The main entry point for training policies.
+
+Args:
+ config (str): path to a config json that will be used to override the default settings.
+ If omitted, default settings are used. This is the preferred way to run experiments.
+
+ algo (str): name of the algorithm to run. Only needs to be provided if @config is not
+ provided.
+
+ name (str): if provided, override the experiment name defined in the config
+
+ dataset (str): if provided, override the dataset path defined in the config
+
+ debug (bool): set this flag to run a quick training run for debugging purposes
+"""
+
+import argparse
+import json
+import numpy as np
+import time
+import os
+import shutil
+import psutil
+import sys
+import socket
+import traceback
+
+from collections import OrderedDict
+
+import torch
+from torch.utils.data import DataLoader
+
+import robomimic
+import robomimic.macros as Macros
+import robomimic.utils.train_utils as TrainUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.env_utils as EnvUtils
+import robomimic.utils.file_utils as FileUtils
+from robomimic.config import config_factory
+from robomimic.algo import algo_factory, RolloutPolicy
+from robomimic.utils.log_utils import PrintLogger, DataLogger, flush_warnings
+
+
+def train(config, device, auto_remove_exp=False):
+ """
+ Train a model using the algorithm.
+ """
+
+ # time this run
+ start_time = time.time()
+
+ # first set seeds
+ np.random.seed(config.train.seed)
+ torch.manual_seed(config.train.seed)
+
+ torch.set_num_threads(2)
+
+ print("\n============= New Training Run with Config =============")
+ print(config)
+ print("")
+ log_dir, ckpt_dir, video_dir = TrainUtils.get_exp_dir(config, auto_remove_exp_dir=auto_remove_exp)
+
+ if config.experiment.logging.terminal_output_to_txt:
+ # log stdout and stderr to a text file
+ logger = PrintLogger(os.path.join(log_dir, 'log.txt'))
+ sys.stdout = logger
+ sys.stderr = logger
+
+ # read config to set up metadata for observation modalities (e.g. detecting rgb observations)
+ ObsUtils.initialize_obs_utils_with_config(config)
+
+ # make sure the dataset exists
+ if isinstance(config.train.data, str):
+ dataset_path = os.path.expandvars(os.path.expanduser(config.train.data))
+ else:
+ eval_dataset_cfg = config.train.data[0]
+ dataset_path = os.path.expandvars(os.path.expanduser(eval_dataset_cfg["path"]))
+ ds_format = config.train.data_format
+ if not os.path.exists(dataset_path):
+ raise Exception("Dataset at provided path {} not found!".format(dataset_path))
+
+ # load basic metadata from training file
+ print("\n============= Loaded Environment Metadata =============")
+ env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path=dataset_path, ds_format=ds_format)
+
+ # update env meta if applicable
+ from robomimic.utils.script_utils import deep_update
+ deep_update(env_meta, config.experiment.env_meta_update_dict)
+
+ shape_meta = FileUtils.get_shape_metadata_from_dataset(
+ dataset_path=dataset_path,
+ action_keys=config.train.action_keys,
+ all_obs_keys=config.all_obs_keys,
+ ds_format=ds_format,
+ verbose=True
+ )
+
+ if config.experiment.env is not None:
+ env_meta["env_name"] = config.experiment.env
+ print("=" * 30 + "\n" + "Replacing Env to {}\n".format(env_meta["env_name"]) + "=" * 30)
+
+ # create environment
+ envs = OrderedDict()
+ if config.experiment.rollout.enabled:
+ # create environments for validation runs
+ env_names = [env_meta["env_name"]]
+
+ if config.experiment.additional_envs is not None:
+ for name in config.experiment.additional_envs:
+ env_names.append(name)
+
+ for env_name in env_names:
+ env = EnvUtils.create_env_from_metadata(
+ env_meta=env_meta,
+ env_name=env_name,
+ render=config.experiment.render,
+ render_offscreen=config.experiment.render_video,
+ use_image_obs=shape_meta["use_images"],
+ use_depth_obs=shape_meta["use_depths"],
+ )
+            env = EnvUtils.wrap_env_from_config(env, config=config) # apply environment wrapper, if applicable
+ envs[env.name] = env
+ print(envs[env.name])
+
+ print("")
+
+ # setup for a new training run
+ data_logger = DataLogger(
+ log_dir,
+ config,
+ log_tb=config.experiment.logging.log_tb,
+ log_wandb=config.experiment.logging.log_wandb,
+ )
+ model = algo_factory(
+ algo_name=config.algo_name,
+ config=config,
+ obs_key_shapes=shape_meta["all_shapes"],
+ ac_dim=shape_meta["ac_dim"],
+ device=device,
+ )
+
+ # save the config as a json file
+ with open(os.path.join(log_dir, '..', 'config.json'), 'w') as outfile:
+ json.dump(config, outfile, indent=4)
+
+ print("\n============= Model Summary =============")
+ print(model) # print model summary
+ print("")
+
+ # load training data
+ trainset, validset = TrainUtils.load_data_for_training(
+ config, obs_keys=shape_meta["all_obs_keys"])
+ train_sampler = trainset.get_dataset_sampler()
+ print("\n============= Training Dataset =============")
+ print(trainset)
+ print("")
+ if validset is not None:
+ print("\n============= Validation Dataset =============")
+ print(validset)
+ print("")
+
+    # maybe retrieve statistics for normalizing observations
+ obs_normalization_stats = None
+ if config.train.hdf5_normalize_obs:
+ obs_normalization_stats = trainset.get_obs_normalization_stats()
+
+    # maybe retrieve statistics for normalizing actions
+ action_normalization_stats = trainset.get_action_normalization_stats()
+
+ # initialize data loaders
+ train_loader = DataLoader(
+ dataset=trainset,
+ sampler=train_sampler,
+ batch_size=config.train.batch_size,
+ shuffle=(train_sampler is None),
+ num_workers=config.train.num_data_workers,
+ drop_last=True
+ )
+
+ if config.experiment.validate:
+ # cap num workers for validation dataset at 1
+ num_workers = min(config.train.num_data_workers, 1)
+ valid_sampler = validset.get_dataset_sampler()
+ valid_loader = DataLoader(
+ dataset=validset,
+ sampler=valid_sampler,
+ batch_size=config.train.batch_size,
+ shuffle=(valid_sampler is None),
+ num_workers=num_workers,
+ drop_last=True
+ )
+ else:
+ valid_loader = None
+
+ # print all warnings before training begins
+ print("*" * 50)
+ print("Warnings generated by robomimic have been duplicated here (from above) for convenience. Please check them carefully.")
+ flush_warnings()
+ print("*" * 50)
+ print("")
+
+ # main training loop
+ best_valid_loss = None
+ best_return = {k: -np.inf for k in envs} if config.experiment.rollout.enabled else None
+ best_success_rate = {k: -1. for k in envs} if config.experiment.rollout.enabled else None
+ last_ckpt_time = time.time()
+
+ need_sync_results = (Macros.RESULTS_SYNC_PATH_ABS is not None)
+ if need_sync_results:
+ # these paths will be updated after each evaluation
+ best_ckpt_path_synced = None
+ best_video_path_synced = None
+ last_ckpt_path_synced = None
+ last_video_path_synced = None
+ log_dir_path_synced = os.path.join(Macros.RESULTS_SYNC_PATH_ABS, "logs")
+
+ # number of learning steps per epoch (defaults to a full dataset pass)
+ train_num_steps = config.experiment.epoch_every_n_steps
+ valid_num_steps = config.experiment.validation_epoch_every_n_steps
+
+ for epoch in range(1, config.train.num_epochs + 1): # epoch numbers start at 1
+ step_log = TrainUtils.run_epoch(
+ model=model,
+ data_loader=train_loader,
+ epoch=epoch,
+ num_steps=train_num_steps,
+ obs_normalization_stats=obs_normalization_stats,
+ )
+ model.on_epoch_end(epoch)
+
+ # setup checkpoint path
+ epoch_ckpt_name = "model_epoch_{}".format(epoch)
+
+ # check for recurring checkpoint saving conditions
+ should_save_ckpt = False
+ if config.experiment.save.enabled:
+ time_check = (config.experiment.save.every_n_seconds is not None) and \
+ (time.time() - last_ckpt_time > config.experiment.save.every_n_seconds)
+ epoch_check = (config.experiment.save.every_n_epochs is not None) and \
+ (epoch > 0) and (epoch % config.experiment.save.every_n_epochs == 0)
+ epoch_list_check = (epoch in config.experiment.save.epochs)
+ should_save_ckpt = (time_check or epoch_check or epoch_list_check)
+ ckpt_reason = None
+ if should_save_ckpt:
+ last_ckpt_time = time.time()
+ ckpt_reason = "time"
+
+ print("Train Epoch {}".format(epoch))
+ print(json.dumps(step_log, sort_keys=True, indent=4))
+ for k, v in step_log.items():
+ if k.startswith("Time_"):
+ data_logger.record("Timing_Stats/Train_{}".format(k[5:]), v, epoch)
+ else:
+ data_logger.record("Train/{}".format(k), v, epoch)
+
+ # Evaluate the model on validation set
+ if config.experiment.validate:
+ with torch.no_grad():
+ step_log = TrainUtils.run_epoch(model=model, data_loader=valid_loader, epoch=epoch, validate=True, num_steps=valid_num_steps)
+ for k, v in step_log.items():
+ if k.startswith("Time_"):
+ data_logger.record("Timing_Stats/Valid_{}".format(k[5:]), v, epoch)
+ else:
+ data_logger.record("Valid/{}".format(k), v, epoch)
+
+ print("Validation Epoch {}".format(epoch))
+ print(json.dumps(step_log, sort_keys=True, indent=4))
+
+ # save checkpoint if achieve new best validation loss
+ valid_check = "Loss" in step_log
+ if valid_check and (best_valid_loss is None or (step_log["Loss"] <= best_valid_loss)):
+ best_valid_loss = step_log["Loss"]
+ if config.experiment.save.enabled and config.experiment.save.on_best_validation:
+ epoch_ckpt_name += "_best_validation_{}".format(best_valid_loss)
+ should_save_ckpt = True
+ ckpt_reason = "valid" if ckpt_reason is None else ckpt_reason
+
+        # Evaluate the model by running rollouts
+
+ # do rollouts at fixed rate or if it's time to save a new ckpt
+ video_paths = None
+ rollout_check = (epoch % config.experiment.rollout.rate == 0) or (should_save_ckpt and ckpt_reason == "time")
+ did_rollouts = False
+ if config.experiment.rollout.enabled and (epoch > config.experiment.rollout.warmstart) and rollout_check:
+
+ # wrap model as a RolloutPolicy to prepare for rollouts
+ rollout_model = RolloutPolicy(
+ model,
+ obs_normalization_stats=obs_normalization_stats,
+ action_normalization_stats=action_normalization_stats,
+ )
+
+ num_episodes = config.experiment.rollout.n
+ all_rollout_logs, video_paths = TrainUtils.rollout_with_stats(
+ policy=rollout_model,
+ envs=envs,
+ horizon=config.experiment.rollout.horizon,
+ use_goals=config.use_goals,
+ num_episodes=num_episodes,
+ render=False,
+ video_dir=video_dir if config.experiment.render_video else None,
+ epoch=epoch,
+ video_skip=config.experiment.get("video_skip", 5),
+ terminate_on_success=config.experiment.rollout.terminate_on_success,
+ )
+
+ # summarize results from rollouts to tensorboard and terminal
+ for env_name in all_rollout_logs:
+ rollout_logs = all_rollout_logs[env_name]
+ for k, v in rollout_logs.items():
+ if k.startswith("Time_"):
+ data_logger.record("Timing_Stats/Rollout_{}_{}".format(env_name, k[5:]), v, epoch)
+ else:
+ data_logger.record("Rollout/{}/{}".format(k, env_name), v, epoch, log_stats=True)
+
+ print("\nEpoch {} Rollouts took {}s (avg) with results:".format(epoch, rollout_logs["time"]))
+ print('Env: {}'.format(env_name))
+ print(json.dumps(rollout_logs, sort_keys=True, indent=4))
+
+ # checkpoint and video saving logic
+ updated_stats = TrainUtils.should_save_from_rollout_logs(
+ all_rollout_logs=all_rollout_logs,
+ best_return=best_return,
+ best_success_rate=best_success_rate,
+ epoch_ckpt_name=epoch_ckpt_name,
+ save_on_best_rollout_return=config.experiment.save.on_best_rollout_return,
+ save_on_best_rollout_success_rate=config.experiment.save.on_best_rollout_success_rate,
+ )
+ best_return = updated_stats["best_return"]
+ best_success_rate = updated_stats["best_success_rate"]
+ epoch_ckpt_name = updated_stats["epoch_ckpt_name"]
+ should_save_ckpt = (config.experiment.save.enabled and updated_stats["should_save_ckpt"]) or should_save_ckpt
+ if updated_stats["ckpt_reason"] is not None:
+ ckpt_reason = updated_stats["ckpt_reason"]
+ did_rollouts = True
+
+ # Only keep saved videos if the ckpt should be saved (but not because of validation score)
+ should_save_video = (should_save_ckpt and (ckpt_reason != "valid")) or config.experiment.keep_all_videos
+ if video_paths is not None and not should_save_video:
+ for env_name in video_paths:
+ os.remove(video_paths[env_name])
+
+ # Save model checkpoints based on conditions (success rate, validation loss, etc)
+ if should_save_ckpt:
+ TrainUtils.save_model(
+ model=model,
+ config=config,
+ env_meta=env_meta,
+ shape_meta=shape_meta,
+ ckpt_path=os.path.join(ckpt_dir, epoch_ckpt_name + ".pth"),
+ obs_normalization_stats=obs_normalization_stats,
+ action_normalization_stats=action_normalization_stats,
+ )
+
+ # maybe sync some results back to scratch space (only if rollouts happened)
+ if did_rollouts and need_sync_results:
+ print("Sync results back to sync path: {}".format(Macros.RESULTS_SYNC_PATH_ABS))
+
+ # get best and latest model checkpoints and videos
+ best_ckpt_path_to_sync, best_video_path_to_sync, best_epoch_to_sync = TrainUtils.get_model_from_output_folder(
+ models_path=ckpt_dir,
+ videos_path=video_dir if config.experiment.render_video else None,
+ best=True,
+ )
+ last_ckpt_path_to_sync, last_video_path_to_sync, last_epoch_to_sync = TrainUtils.get_model_from_output_folder(
+ models_path=ckpt_dir,
+ videos_path=video_dir if config.experiment.render_video else None,
+ last=True,
+ )
+
+ # clear last files that we synced over
+ if best_ckpt_path_synced is not None:
+ os.remove(best_ckpt_path_synced)
+ if last_ckpt_path_synced is not None:
+ os.remove(last_ckpt_path_synced)
+ if best_video_path_synced is not None:
+ os.remove(best_video_path_synced)
+ if last_video_path_synced is not None:
+ os.remove(last_video_path_synced)
+ if os.path.exists(log_dir_path_synced):
+ shutil.rmtree(log_dir_path_synced)
+
+ # set write paths and sync new files over
+ best_success_rate_for_sync = float(best_ckpt_path_to_sync.split("success_")[-1][:-4])
+ best_ckpt_path_synced = os.path.join(
+ Macros.RESULTS_SYNC_PATH_ABS,
+ os.path.basename(best_ckpt_path_to_sync)[:-4] + "_best.pth",
+ )
+ shutil.copyfile(best_ckpt_path_to_sync, best_ckpt_path_synced)
+ last_ckpt_path_synced = os.path.join(
+ Macros.RESULTS_SYNC_PATH_ABS,
+ os.path.basename(last_ckpt_path_to_sync)[:-4] + "_last.pth",
+ )
+ shutil.copyfile(last_ckpt_path_to_sync, last_ckpt_path_synced)
+ if config.experiment.render_video:
+ best_video_path_synced = os.path.join(
+ Macros.RESULTS_SYNC_PATH_ABS,
+ os.path.basename(best_video_path_to_sync)[:-4] + "_best_{}.mp4".format(best_success_rate_for_sync),
+ )
+ shutil.copyfile(best_video_path_to_sync, best_video_path_synced)
+ last_video_path_synced = os.path.join(
+ Macros.RESULTS_SYNC_PATH_ABS,
+ os.path.basename(last_video_path_to_sync)[:-4] + "_last.mp4",
+ )
+ shutil.copyfile(last_video_path_to_sync, last_video_path_synced)
+ # sync logs dir
+ shutil.copytree(log_dir, log_dir_path_synced)
+ # sync config json
+ shutil.copyfile(
+ os.path.join(log_dir, '..', 'config.json'),
+ os.path.join(Macros.RESULTS_SYNC_PATH_ABS, 'config.json')
+ )
+
+ # Finally, log memory usage in MB
+ process = psutil.Process(os.getpid())
+ mem_usage = int(process.memory_info().rss / 1000000)
+ data_logger.record("System/RAM Usage (MB)", mem_usage, epoch)
+ print("\nEpoch {} Memory Usage: {} MB\n".format(epoch, mem_usage))
+
+ # terminate logging
+ data_logger.close()
+
+ # sync logs after closing data logger to make sure everything was transferred
+ if need_sync_results:
+ print("Sync results back to sync path: {}".format(Macros.RESULTS_SYNC_PATH_ABS))
+ # sync logs dir
+ if os.path.exists(log_dir_path_synced):
+ shutil.rmtree(log_dir_path_synced)
+ shutil.copytree(log_dir, log_dir_path_synced)
+
+ # collect important statistics
+ important_stats = dict()
+ prefix = "Rollout/Success_Rate/"
+ exception_prefix = "Rollout/Exception_Rate/"
+ for k in data_logger._data:
+ if k.startswith(prefix):
+ suffix = k[len(prefix):]
+ stats = data_logger.get_stats(k)
+ important_stats["{}-max".format(suffix)] = stats["max"]
+ important_stats["{}-mean".format(suffix)] = stats["mean"]
+ elif k.startswith(exception_prefix):
+ suffix = k[len(exception_prefix):]
+ stats = data_logger.get_stats(k)
+ important_stats["{}-exception-rate-max".format(suffix)] = stats["max"]
+ important_stats["{}-exception-rate-mean".format(suffix)] = stats["mean"]
+
+ # add in time taken
+ important_stats["time spent (hrs)"] = "{:.2f}".format((time.time() - start_time) / 3600.)
+
+ # write stats to disk
+ json_file_path = os.path.join(log_dir, "important_stats.json")
+ with open(json_file_path, 'w') as f:
+ # preserve original key ordering
+ json.dump(important_stats, f, sort_keys=False, indent=4)
+
+ return important_stats
+
+
+def main(args):
+
+ if args.config is not None:
+ ext_cfg = json.load(open(args.config, 'r'))
+ config = config_factory(ext_cfg["algo_name"])
+ # update config with external json - this will throw errors if
+ # the external config has keys not present in the base algo config
+ with config.values_unlocked():
+ config.update(ext_cfg)
+ else:
+ config = config_factory(args.algo)
+
+ if args.dataset is not None:
+ config.train.data = [dict(path=args.dataset)]
+
+ if args.name is not None:
+ config.experiment.name = args.name
+
+ if args.output is not None:
+ config.train.output_dir = args.output
+
+ # get torch device
+ device = TorchUtils.get_torch_device(try_to_use_cuda=config.train.cuda)
+
+ # maybe modify config for debugging purposes
+ if args.debug:
+ Macros.DEBUG = True
+
+ # shrink length of training to test whether this run is likely to crash
+ config.unlock()
+ config.lock_keys()
+
+ # train and validate (if enabled) for 3 gradient steps, for 2 epochs
+ config.experiment.epoch_every_n_steps = 3
+ config.experiment.validation_epoch_every_n_steps = 3
+ config.train.num_epochs = 2
+
+ # if rollouts are enabled, try 2 rollouts at end of each epoch, with 10 environment steps
+ config.experiment.rollout.rate = 1
+ config.experiment.rollout.n = 2
+ config.experiment.rollout.horizon = 10
+
+ # send output to a temporary directory
+ config.train.output_dir = "/tmp/tmp_trained_models"
+
+ # lock config to prevent further modifications and ensure missing keys raise errors
+ config.lock()
+
+ # catch error during training and print it
+ res_str = "finished run successfully!"
+ important_stats = None
+ try:
+ important_stats = train(config, device=device, auto_remove_exp=args.auto_remove_exp)
+ except Exception as e:
+ res_str = "run failed with error:\n{}\n\n{}".format(e, traceback.format_exc())
+ print(res_str)
+    important_stats_str = None
+    if important_stats is not None:
+        important_stats_str = json.dumps(important_stats, indent=4)
+        print("\nRollout Success Rate Stats")
+        print(important_stats_str)
+
+ # maybe sync important stats back
+ if Macros.RESULTS_SYNC_PATH_ABS is not None:
+ json_file_path = os.path.join(Macros.RESULTS_SYNC_PATH_ABS, "important_stats.json")
+ with open(json_file_path, 'w') as f:
+ # preserve original key ordering
+ json.dump(important_stats, f, sort_keys=False, indent=4)
+
+ # maybe give slack notification
+ if Macros.SLACK_TOKEN is not None:
+ from robomimic.scripts.give_slack_notification import give_slack_notif
+ msg = "Completed the following training run!\nHostname: {}\nExperiment Name: {}\n".format(socket.gethostname(), config.experiment.name)
+ msg += "```{}```".format(res_str)
+        if important_stats_str is not None:
+            msg += "\nRollout Success Rate Stats"
+            msg += "\n```{}```".format(important_stats_str)
+ give_slack_notif(msg)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # External config file that overwrites default config
+ parser.add_argument(
+ "--config",
+ type=str,
+ default=None,
+ help="(optional) path to a config json that will be used to override the default settings. \
+ If omitted, default settings are used. This is the preferred way to run experiments.",
+ )
+
+ # Algorithm Name
+ parser.add_argument(
+ "--algo",
+ type=str,
+ help="(optional) name of algorithm to run. Only needs to be provided if --config is not provided",
+ )
+
+ # Experiment Name (for tensorboard, saving models, etc.)
+ parser.add_argument(
+ "--name",
+ type=str,
+ default=None,
+ help="(optional) if provided, override the experiment name defined in the config",
+ )
+
+ # Dataset path, to override the one in the config
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ default=None,
+ help="(optional) if provided, override the dataset path defined in the config",
+ )
+
+ # Output path, to override the one in the config
+ parser.add_argument(
+ "--output",
+ type=str,
+ default=None,
+ help="(optional) if provided, override the output folder path defined in the config",
+ )
+
+ # force delete the experiment folder if it exists
+ parser.add_argument(
+ "--auto-remove-exp",
+ action='store_true',
+ help="force delete the experiment folder if it exists"
+ )
+
+ # debug mode
+ parser.add_argument(
+ "--debug",
+ action='store_true',
+ help="set this flag to run a quick training run for debugging purposes"
+ )
+
+ args = parser.parse_args()
+ main(args)
+
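+
+# Illustrative invocation of this training entry point (assumed script path and file locations):
+#
+#   python robomimic/scripts/train.py --config /path/to/config.json --dataset /path/to/demos.hdf5 --debug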
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/__init__.py b/phantom/submodules/phantom-robomimic/robomimic/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/action_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/action_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac974d50c5fab46b85cd8c3bb76d8e05a4f56aba
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/action_utils.py
@@ -0,0 +1,35 @@
+
+from typing import Union, Sequence, Dict, Optional, Tuple
+
+from copy import deepcopy
+from collections import OrderedDict
+import functools
+
+import numpy as np
+
+
+def action_dict_to_vector(
+ action_dict: Dict[str, np.ndarray],
+ action_keys: Optional[Sequence[str]]=None) -> np.ndarray:
+ if action_keys is None:
+ action_keys = list(action_dict.keys())
+ actions = [action_dict[k] for k in action_keys]
+
+ action_vec = np.concatenate(actions, axis=-1)
+ return action_vec
+
+
+def vector_to_action_dict(
+ action: np.ndarray,
+ action_shapes: Dict[str, Tuple[int]],
+ action_keys: Sequence[str]) -> Dict[str, np.ndarray]:
+ action_dict = dict()
+ start_idx = 0
+ for key in action_keys:
+ this_act_shape = action_shapes[key]
+ this_act_dim = np.prod(this_act_shape)
+ end_idx = start_idx + this_act_dim
+ action_dict[key] = action[...,start_idx:end_idx].reshape(
+ action.shape[:-1]+this_act_shape)
+ start_idx = end_idx
+ return action_dict
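+
+
+# Illustrative sketch (not part of the library API): round-tripping a small action dictionary through the
+# two helpers above. The key names and shapes below are made up for the example.
+#
+#   action_dict = {"pos": np.zeros((10, 3)), "gripper": np.ones((10, 1))}
+#   vec = action_dict_to_vector(action_dict, action_keys=["pos", "gripper"])                # shape (10, 4)
+#   back = vector_to_action_dict(vec, {"pos": (3,), "gripper": (1,)}, ["pos", "gripper"])
+#   assert back["pos"].shape == (10, 3) and back["gripper"].shape == (10, 1)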
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/dataset.py b/phantom/submodules/phantom-robomimic/robomimic/utils/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d429c7a46d09767f8b1946765e599bbc5667da3
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/dataset.py
@@ -0,0 +1,1134 @@
+"""
+This file contains Dataset classes that are used by torch dataloaders
+to fetch batches from hdf5 files.
+"""
+import os
+import h5py
+import numpy as np
+from copy import deepcopy
+from contextlib import contextmanager
+from collections import OrderedDict
+
+import torch.utils.data
+
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.action_utils as AcUtils
+import robomimic.utils.log_utils as LogUtils
+
+
+class SequenceDataset(torch.utils.data.Dataset):
+ def __init__(
+ self,
+ hdf5_path,
+ obs_keys,
+ action_keys,
+ dataset_keys,
+ action_config,
+ frame_stack=1,
+ seq_length=1,
+ pad_frame_stack=True,
+ pad_seq_length=True,
+ get_pad_mask=False,
+ goal_mode=None,
+ hdf5_cache_mode=None,
+ hdf5_use_swmr=True,
+ hdf5_normalize_obs=False,
+ filter_by_attribute=None,
+ load_next_obs=True,
+ ):
+ """
+ Dataset class for fetching sequences of experience.
+ Length of the fetched sequence is equal to (@frame_stack - 1 + @seq_length)
+
+ Args:
+ hdf5_path (str): path to hdf5
+
+ obs_keys (tuple, list): keys to observation items (image, object, etc) to be fetched from the dataset
+
+            action_keys (tuple, list): keys to action components that are concatenated (in the given order)
+                to form the action vector returned under "actions"
+
+            action_config (dict): dictionary mapping each action key to a config dict, e.g. specifying its
+                "normalization" method (None, "min_max", or "gaussian") used when computing normalization stats
+
+ dataset_keys (tuple, list): keys to dataset items (actions, rewards, etc) to be fetched from the dataset
+
+            frame_stack (int): number of stacked frames to fetch. Defaults to 1 (single frame).
+
+ seq_length (int): length of sequences to sample. Defaults to 1 (single frame).
+
+            pad_frame_stack (bool): whether to pad sequence for frame stacking at the beginning of a demo. This
+ ensures that partial frame stacks are observed, such as (s_0, s_0, s_0, s_1). Otherwise, the
+ first frame stacked observation would be (s_0, s_1, s_2, s_3).
+
+            pad_seq_length (bool): whether to pad sequence for sequence fetching at the end of a demo. This
+ ensures that partial sequences at the end of a demonstration are observed, such as
+ (s_{T-1}, s_{T}, s_{T}, s_{T}). Otherwise, the last sequence provided would be
+ (s_{T-3}, s_{T-2}, s_{T-1}, s_{T}).
+
+ get_pad_mask (bool): if True, also provide padding masks as part of the batch. This can be
+ useful for masking loss functions on padded parts of the data.
+
+ goal_mode (str): either "last" or None. Defaults to None, which is to not fetch goals
+
+ hdf5_cache_mode (str): one of ["all", "low_dim", or None]. Set to "all" to cache entire hdf5
+ in memory - this is by far the fastest for data loading. Set to "low_dim" to cache all
+ non-image data. Set to None to use no caching - in this case, every batch sample is
+ retrieved via file i/o. You should almost never set this to None, even for large
+ image datasets.
+
+ hdf5_use_swmr (bool): whether to use swmr feature when opening the hdf5 file. This ensures
+ that multiple Dataset instances can all access the same hdf5 file without problems.
+
+            hdf5_normalize_obs (bool): if True, normalize observations by computing the mean and std of each
+                observation (in each dimension and modality), and normalizing each dimension to zero mean
+                and unit variance.
+
+ filter_by_attribute (str): if provided, use the provided filter key to look up a subset of
+ demonstrations to load
+
+ load_next_obs (bool): whether to load next_obs from the dataset
+ """
+ super(SequenceDataset, self).__init__()
+
+ self.hdf5_path = os.path.expandvars(os.path.expanduser(hdf5_path))
+ self.hdf5_use_swmr = hdf5_use_swmr
+ self.hdf5_normalize_obs = hdf5_normalize_obs
+ self._hdf5_file = None
+
+ assert hdf5_cache_mode in ["all", "low_dim", None]
+ self.hdf5_cache_mode = hdf5_cache_mode
+
+ self.load_next_obs = load_next_obs
+ self.filter_by_attribute = filter_by_attribute
+
+        # get all keys that need to be fetched
+ self.obs_keys = tuple(obs_keys)
+ self.action_keys = tuple(action_keys)
+ self.dataset_keys = tuple(dataset_keys)
+ # add action keys to dataset keys
+ if self.action_keys is not None:
+ self.dataset_keys = tuple(set(self.dataset_keys).union(set(self.action_keys)))
+
+ self.action_config = action_config
+
+ self.n_frame_stack = frame_stack
+ assert self.n_frame_stack >= 1
+
+ self.seq_length = seq_length
+ assert self.seq_length >= 1
+
+ self.goal_mode = goal_mode
+ if self.goal_mode is not None:
+ assert self.goal_mode in ["last"]
+ if not self.load_next_obs:
+ assert self.goal_mode != "last" # we use last next_obs as goal
+
+ self.pad_seq_length = pad_seq_length
+ self.pad_frame_stack = pad_frame_stack
+ self.get_pad_mask = get_pad_mask
+
+ self.load_demo_info(filter_by_attribute=self.filter_by_attribute)
+
+ # maybe prepare for observation normalization
+ self.obs_normalization_stats = None
+ if self.hdf5_normalize_obs:
+ self.obs_normalization_stats = self.normalize_obs()
+
+ # prepare for action normalization
+ self.action_normalization_stats = None
+
+ # maybe store dataset in memory for fast access
+ if self.hdf5_cache_mode in ["all", "low_dim"]:
+ obs_keys_in_memory = self.obs_keys
+ if self.hdf5_cache_mode == "low_dim":
+ # only store low-dim observations
+ obs_keys_in_memory = []
+ for k in self.obs_keys:
+ if ObsUtils.key_is_obs_modality(k, "low_dim"):
+ obs_keys_in_memory.append(k)
+ self.obs_keys_in_memory = obs_keys_in_memory
+
+ self.hdf5_cache = self.load_dataset_in_memory(
+ demo_list=self.demos,
+ hdf5_file=self.hdf5_file,
+ obs_keys=self.obs_keys_in_memory,
+ dataset_keys=self.dataset_keys,
+ load_next_obs=self.load_next_obs
+ )
+
+ if self.hdf5_cache_mode == "all":
+ # cache getitem calls for even more speedup. We don't do this for
+ # "low-dim" since image observations require calls to getitem anyways.
+ print("SequenceDataset: caching get_item calls...")
+ self.getitem_cache = [self.get_item(i) for i in LogUtils.custom_tqdm(range(len(self)))]
+
+ # don't need the previous cache anymore
+ del self.hdf5_cache
+ self.hdf5_cache = None
+ else:
+ self.hdf5_cache = None
+
+ self.close_and_delete_hdf5_handle()
+
+ def load_demo_info(self, filter_by_attribute=None, demos=None):
+ """
+ Args:
+ filter_by_attribute (str): if provided, use the provided filter key
+ to select a subset of demonstration trajectories to load
+
+ demos (list): list of demonstration keys to load from the hdf5 file. If
+ omitted, all demos in the file (or under the @filter_by_attribute
+ filter key) are used.
+ """
+ # filter demo trajectory by mask
+ if demos is not None:
+ self.demos = demos
+ elif filter_by_attribute is not None:
+ self.demos = [elem.decode("utf-8") for elem in np.array(self.hdf5_file["mask/{}".format(filter_by_attribute)][:])]
+ else:
+ self.demos = list(self.hdf5_file["data"].keys())
+
+ # sort demo keys
+ inds = np.argsort([int(elem[5:]) for elem in self.demos])
+ self.demos = [self.demos[i] for i in inds]
+
+ self.n_demos = len(self.demos)
+
+ # keep internal index maps to know which transitions belong to which demos
+ self._index_to_demo_id = dict() # maps every index to a demo id
+ self._demo_id_to_start_indices = dict() # gives start index per demo id
+ self._demo_id_to_demo_length = dict()
+
+ # determine index mapping
+ self.total_num_sequences = 0
+ for ep in self.demos:
+ demo_length = self.hdf5_file["data/{}".format(ep)].attrs["num_samples"]
+ self._demo_id_to_start_indices[ep] = self.total_num_sequences
+ self._demo_id_to_demo_length[ep] = demo_length
+
+ num_sequences = demo_length
+ # determine actual number of sequences taking into account whether to pad for frame_stack and seq_length
+ if not self.pad_frame_stack:
+ num_sequences -= (self.n_frame_stack - 1)
+ if not self.pad_seq_length:
+ num_sequences -= (self.seq_length - 1)
+
+ if self.pad_seq_length:
+ assert demo_length >= 1 # sequence needs to have at least one sample
+ num_sequences = max(num_sequences, 1)
+ else:
+ assert num_sequences >= 1 # assume demo_length >= (self.n_frame_stack - 1 + self.seq_length)
+
+ for _ in range(num_sequences):
+ self._index_to_demo_id[self.total_num_sequences] = ep
+ self.total_num_sequences += 1
+
+ @property
+ def hdf5_file(self):
+ """
+ This property allows for a lazy hdf5 file open.
+ """
+ if self._hdf5_file is None:
+ self._hdf5_file = h5py.File(self.hdf5_path, 'r', swmr=self.hdf5_use_swmr, libver='latest')
+ return self._hdf5_file
+
+ def close_and_delete_hdf5_handle(self):
+ """
+ Maybe close the file handle.
+ """
+ if self._hdf5_file is not None:
+ self._hdf5_file.close()
+ self._hdf5_file = None
+
+ @contextmanager
+ def hdf5_file_opened(self):
+ """
+ Convenient context manager to open the file on entering the scope
+ and then close it on leaving.
+ """
+ should_close = self._hdf5_file is None
+ yield self.hdf5_file
+ if should_close:
+ self.close_and_delete_hdf5_handle()
+
+ def __del__(self):
+ self.close_and_delete_hdf5_handle()
+
+ def __repr__(self):
+ """
+ Pretty print the class and important attributes on a call to `print`.
+ """
+ msg = str(self.__class__.__name__)
+ msg += " (\n\tpath={}\n\tobs_keys={}\n\tseq_length={}\n\tfilter_key={}\n\tframe_stack={}\n"
+ msg += "\tpad_seq_length={}\n\tpad_frame_stack={}\n\tgoal_mode={}\n"
+ msg += "\tcache_mode={}\n"
+ msg += "\tnum_demos={}\n\tnum_sequences={}\n)"
+ filter_key_str = self.filter_by_attribute if self.filter_by_attribute is not None else "none"
+ goal_mode_str = self.goal_mode if self.goal_mode is not None else "none"
+ cache_mode_str = self.hdf5_cache_mode if self.hdf5_cache_mode is not None else "none"
+ msg = msg.format(self.hdf5_path, self.obs_keys, self.seq_length, filter_key_str, self.n_frame_stack,
+ self.pad_seq_length, self.pad_frame_stack, goal_mode_str, cache_mode_str,
+ self.n_demos, self.total_num_sequences)
+ return msg
+
+ def __len__(self):
+ """
+ Ensure that the torch dataloader will do a complete pass through all sequences in
+ the dataset before starting a new iteration.
+ """
+ return self.total_num_sequences
+
+ def load_dataset_in_memory(self, demo_list, hdf5_file, obs_keys, dataset_keys, load_next_obs):
+ """
+ Loads the hdf5 dataset into memory, preserving the structure of the file. Note that this
+ differs from `self.getitem_cache`, which, if active, actually caches the outputs of the
+ `getitem` operation.
+
+ Args:
+ demo_list (list): list of demo keys, e.g., 'demo_0'
+ hdf5_file (h5py.File): file handle to the hdf5 dataset.
+ obs_keys (list, tuple): observation keys to fetch, e.g., 'images'
+ dataset_keys (list, tuple): dataset keys to fetch, e.g., 'actions'
+ load_next_obs (bool): whether to load next_obs from the dataset
+
+ Returns:
+ all_data (dict): dictionary of loaded data.
+ """
+ all_data = dict()
+ print("SequenceDataset: loading dataset into memory...")
+ for ep in LogUtils.custom_tqdm(demo_list):
+ all_data[ep] = {}
+ all_data[ep]["attrs"] = {}
+ all_data[ep]["attrs"]["num_samples"] = hdf5_file["data/{}".format(ep)].attrs["num_samples"]
+ # get obs
+ all_data[ep]["obs"] = {k: hdf5_file["data/{}/obs/{}".format(ep, k)][()] for k in obs_keys}
+ if load_next_obs:
+ all_data[ep]["next_obs"] = {k: hdf5_file["data/{}/next_obs/{}".format(ep, k)][()] for k in obs_keys}
+ # get other dataset keys
+ for k in dataset_keys:
+ if k in hdf5_file["data/{}".format(ep)]:
+ all_data[ep][k] = hdf5_file["data/{}/{}".format(ep, k)][()].astype('float32')
+ else:
+ all_data[ep][k] = np.zeros((all_data[ep]["attrs"]["num_samples"], 1), dtype=np.float32)
+
+ if "model_file" in hdf5_file["data/{}".format(ep)].attrs:
+ all_data[ep]["attrs"]["model_file"] = hdf5_file["data/{}".format(ep)].attrs["model_file"]
+
+ return all_data
+
+ def normalize_obs(self):
+ """
+ Computes a dataset-wide mean and standard deviation for the observations
+ (per dimension and per obs key) and returns it.
+ """
+
+ # Run through all trajectories. For each one, compute minimal observation statistics, and then aggregate
+ # with the previous statistics.
+ ep = self.demos[0]
+ obs_traj = {k: self.hdf5_file["data/{}/obs/{}".format(ep, k)][()].astype('float32') for k in self.obs_keys}
+ obs_traj = ObsUtils.process_obs_dict(obs_traj)
+ merged_stats = _compute_traj_stats(obs_traj)
+ print("SequenceDataset: normalizing observations...")
+ for ep in LogUtils.custom_tqdm(self.demos[1:]):
+ obs_traj = {k: self.hdf5_file["data/{}/obs/{}".format(ep, k)][()].astype('float32') for k in self.obs_keys}
+ obs_traj = ObsUtils.process_obs_dict(obs_traj)
+ traj_stats = _compute_traj_stats(obs_traj)
+ merged_stats = _aggregate_traj_stats(merged_stats, traj_stats)
+
+ obs_normalization_stats = { k : {} for k in merged_stats }
+ for k in merged_stats:
+ # note we add a small tolerance of 1e-3 for std
+ obs_normalization_stats[k]["mean"] = merged_stats[k]["mean"].astype(np.float32)
+ obs_normalization_stats[k]["std"] = (np.sqrt(merged_stats[k]["sqdiff"] / merged_stats[k]["n"]) + 1e-3).astype(np.float32)
+ return obs_normalization_stats
+
+ def get_obs_normalization_stats(self):
+ """
+ Returns dictionary of mean and std for each observation key if using
+ observation normalization, otherwise None.
+
+ Returns:
+ obs_normalization_stats (dict): a dictionary for observation
+ normalization. This maps observation keys to dicts
+ with a "mean" and "std" of shape (1, ...) where ... is the default
+ shape for the observation.
+ """
+ assert self.hdf5_normalize_obs, "not using observation normalization!"
+ return deepcopy(self.obs_normalization_stats)
+
+ def get_action_traj(self, ep):
+ action_traj = dict()
+ for key in self.action_keys:
+ action_traj[key] = self.hdf5_file["data/{}/{}".format(ep, key)][()].astype('float32')
+ return action_traj
+
+ def get_action_stats(self):
+ ep = self.demos[0]
+ action_traj = self.get_action_traj(ep)
+ action_stats = _compute_traj_stats(action_traj)
+ print("SequenceDataset: normalizing actions...")
+ for ep in LogUtils.custom_tqdm(self.demos[1:]):
+ action_traj = self.get_action_traj(ep)
+ traj_stats = _compute_traj_stats(action_traj)
+ action_stats = _aggregate_traj_stats(action_stats, traj_stats)
+ return action_stats
+
+ def set_action_normalization_stats(self, action_normalization_stats):
+ self.action_normalization_stats = action_normalization_stats
+
+ def get_action_normalization_stats(self):
+ """
+ Computes a dataset-wide min, max, mean and standard deviation for the actions
+ (per dimension) and returns it.
+ """
+
+        # Run through all trajectories. For each one, compute its action statistics, and then aggregate
+ # with the previous statistics.
+ if self.action_normalization_stats is None:
+ action_stats = self.get_action_stats()
+ self.action_normalization_stats = action_stats_to_normalization_stats(
+ action_stats, self.action_config)
+ return self.action_normalization_stats
+
+ def get_dataset_for_ep(self, ep, key):
+ """
+ Helper utility to get a dataset for a specific demonstration.
+ Takes into account whether the dataset has been loaded into memory.
+ """
+
+ # check if this key should be in memory
+ key_should_be_in_memory = (self.hdf5_cache_mode in ["all", "low_dim"])
+ if key_should_be_in_memory:
+ # if key is an observation, it may not be in memory
+ if '/' in key:
+ key1, key2 = key.split('/')
+ assert(key1 in ['obs', 'next_obs', 'action_dict'])
+ if key2 not in self.obs_keys_in_memory:
+ key_should_be_in_memory = False
+
+ if key_should_be_in_memory:
+ # read cache
+ if '/' in key:
+ key1, key2 = key.split('/')
+ assert(key1 in ['obs', 'next_obs', 'action_dict'])
+ ret = self.hdf5_cache[ep][key1][key2]
+ else:
+ ret = self.hdf5_cache[ep][key]
+ else:
+ # read from file
+ hd5key = "data/{}/{}".format(ep, key)
+ ret = self.hdf5_file[hd5key]
+ return ret
+
+ def __getitem__(self, index):
+ """
+ Fetch dataset sequence @index (inferred through internal index map), using the getitem_cache if available.
+ """
+ if self.hdf5_cache_mode == "all":
+ return self.getitem_cache[index]
+ return self.get_item(index)
+
+ def get_item(self, index):
+ """
+ Main implementation of getitem when not using cache.
+ """
+
+ demo_id = self._index_to_demo_id[index]
+ demo_start_index = self._demo_id_to_start_indices[demo_id]
+ demo_length = self._demo_id_to_demo_length[demo_id]
+
+ # start at offset index if not padding for frame stacking
+ demo_index_offset = 0 if self.pad_frame_stack else (self.n_frame_stack - 1)
+ index_in_demo = index - demo_start_index + demo_index_offset
+
+ # end at offset index if not padding for seq length
+ demo_length_offset = 0 if self.pad_seq_length else (self.seq_length - 1)
+ end_index_in_demo = demo_length - demo_length_offset
+
+ meta = self.get_dataset_sequence_from_demo(
+ demo_id,
+ index_in_demo=index_in_demo,
+ keys=self.dataset_keys,
+ num_frames_to_stack=self.n_frame_stack - 1, # note: need to decrement self.n_frame_stack by one
+ seq_length=self.seq_length
+ )
+
+ # determine goal index
+ goal_index = None
+ if self.goal_mode == "last":
+ goal_index = end_index_in_demo - 1
+
+ meta["obs"] = self.get_obs_sequence_from_demo(
+ demo_id,
+ index_in_demo=index_in_demo,
+ keys=self.obs_keys,
+ num_frames_to_stack=self.n_frame_stack - 1,
+ seq_length=self.seq_length,
+ prefix="obs"
+ )
+
+ if self.load_next_obs:
+ meta["next_obs"] = self.get_obs_sequence_from_demo(
+ demo_id,
+ index_in_demo=index_in_demo,
+ keys=self.obs_keys,
+ num_frames_to_stack=self.n_frame_stack - 1,
+ seq_length=self.seq_length,
+ prefix="next_obs"
+ )
+
+ if goal_index is not None:
+ goal = self.get_obs_sequence_from_demo(
+ demo_id,
+ index_in_demo=goal_index,
+ keys=self.obs_keys,
+ num_frames_to_stack=0,
+ seq_length=1,
+ prefix="next_obs",
+ )
+ meta["goal_obs"] = {k: goal[k][0] for k in goal} # remove sequence dimension for goal
+
+ # get action components
+ ac_dict = OrderedDict()
+ for k in self.action_keys:
+ ac = meta[k]
+ # expand action shape if needed
+ if len(ac.shape) == 1:
+ ac = ac.reshape(-1, 1)
+ ac_dict[k] = ac
+
+ # normalize actions
+ action_normalization_stats = self.get_action_normalization_stats()
+ ac_dict = ObsUtils.normalize_dict(ac_dict, normalization_stats=action_normalization_stats)
+
+ # concatenate all action components
+ meta["actions"] = AcUtils.action_dict_to_vector(ac_dict)
+
+ # also return the sampled index
+ meta["index"] = index
+
+ return meta
+
+ def get_sequence_from_demo(self, demo_id, index_in_demo, keys, num_frames_to_stack=0, seq_length=1):
+ """
+ Extract a (sub)sequence of data items from a demo given the @keys of the items.
+
+ Args:
+ demo_id (str): id of the demo, e.g., demo_0
+ index_in_demo (int): beginning index of the sequence wrt the demo
+ keys (tuple): list of keys to extract
+            num_frames_to_stack (int): number of frames to stack. The sequence gets prepended with repeated items if out of range
+            seq_length (int): sequence length to extract. The sequence gets appended with repeated items if out of range
+
+ Returns:
+ a dictionary of extracted items.
+ """
+ assert num_frames_to_stack >= 0
+ assert seq_length >= 1
+
+ demo_length = self._demo_id_to_demo_length[demo_id]
+ assert index_in_demo < demo_length
+
+ # determine begin and end of sequence
+ seq_begin_index = max(0, index_in_demo - num_frames_to_stack)
+ seq_end_index = min(demo_length, index_in_demo + seq_length)
+
+ # determine sequence padding
+ seq_begin_pad = max(0, num_frames_to_stack - index_in_demo) # pad for frame stacking
+ seq_end_pad = max(0, index_in_demo + seq_length - demo_length) # pad for sequence length
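+
+        # Worked example of the index arithmetic above (illustrative values): with demo_length=10,
+        # index_in_demo=0, num_frames_to_stack=2, seq_length=5, we get seq_begin_index=0, seq_end_index=5,
+        # seq_begin_pad=2, seq_end_pad=0 (the first two stacked frames repeat s_0). With index_in_demo=8
+        # and the same settings, seq_end_index=10 and seq_end_pad=3 (the last three steps repeat s_9).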
+
+ # make sure we are not padding if specified.
+ if not self.pad_frame_stack:
+ assert seq_begin_pad == 0
+ if not self.pad_seq_length:
+ assert seq_end_pad == 0
+
+ # fetch observation from the dataset file
+ seq = dict()
+ for k in keys:
+ data = self.get_dataset_for_ep(demo_id, k)
+ seq[k] = data[seq_begin_index: seq_end_index]
+
+ seq = TensorUtils.pad_sequence(seq, padding=(seq_begin_pad, seq_end_pad), pad_same=True)
+ pad_mask = np.array([0] * seq_begin_pad + [1] * (seq_end_index - seq_begin_index) + [0] * seq_end_pad)
+ pad_mask = pad_mask[:, None].astype(bool)
+
+ return seq, pad_mask
+
+ def get_obs_sequence_from_demo(self, demo_id, index_in_demo, keys, num_frames_to_stack=0, seq_length=1, prefix="obs"):
+ """
+ Extract a (sub)sequence of observation items from a demo given the @keys of the items.
+
+ Args:
+ demo_id (str): id of the demo, e.g., demo_0
+ index_in_demo (int): beginning index of the sequence wrt the demo
+ keys (tuple): list of keys to extract
+            num_frames_to_stack (int): number of frames to stack. The sequence gets prepended with repeated items if out of range
+            seq_length (int): sequence length to extract. The sequence gets appended with repeated items if out of range
+ prefix (str): one of "obs", "next_obs"
+
+ Returns:
+ a dictionary of extracted items.
+ """
+ obs, pad_mask = self.get_sequence_from_demo(
+ demo_id,
+ index_in_demo=index_in_demo,
+ keys=tuple('{}/{}'.format(prefix, k) for k in keys),
+ num_frames_to_stack=num_frames_to_stack,
+ seq_length=seq_length,
+ )
+ obs = {'/'.join(k.split('/')[1:]): obs[k] for k in obs} # strip the prefix
+ if self.get_pad_mask:
+ obs["pad_mask"] = pad_mask
+
+ return obs
+
+ def get_dataset_sequence_from_demo(self, demo_id, index_in_demo, keys, num_frames_to_stack=0, seq_length=1):
+ """
+ Extract a (sub)sequence of dataset items from a demo given the @keys of the items (e.g., states, actions).
+
+ Args:
+ demo_id (str): id of the demo, e.g., demo_0
+ index_in_demo (int): beginning index of the sequence wrt the demo
+ keys (tuple): list of keys to extract
+            num_frames_to_stack (int): number of frames to stack. The sequence gets prepended with repeated items if out of range
+            seq_length (int): sequence length to extract. The sequence gets appended with repeated items if out of range
+
+ Returns:
+ a dictionary of extracted items.
+ """
+ data, pad_mask = self.get_sequence_from_demo(
+ demo_id,
+ index_in_demo=index_in_demo,
+ keys=keys,
+ num_frames_to_stack=num_frames_to_stack,
+ seq_length=seq_length,
+ )
+ if self.get_pad_mask:
+ data["pad_mask"] = pad_mask
+ return data
+
+ def get_trajectory_at_index(self, index):
+ """
+ Method provided as a utility to get an entire trajectory, given
+ the corresponding @index.
+ """
+ demo_id = self.demos[index]
+ demo_length = self._demo_id_to_demo_length[demo_id]
+
+ meta = self.get_dataset_sequence_from_demo(
+ demo_id,
+ index_in_demo=0,
+ keys=self.dataset_keys,
+ num_frames_to_stack=self.n_frame_stack - 1, # note: need to decrement self.n_frame_stack by one
+ seq_length=demo_length
+ )
+ meta["obs"] = self.get_obs_sequence_from_demo(
+ demo_id,
+ index_in_demo=0,
+ keys=self.obs_keys,
+ seq_length=demo_length
+ )
+ if self.load_next_obs:
+ meta["next_obs"] = self.get_obs_sequence_from_demo(
+ demo_id,
+ index_in_demo=0,
+ keys=self.obs_keys,
+ seq_length=demo_length,
+ prefix="next_obs"
+ )
+
+ meta["ep"] = demo_id
+ return meta
+
+ def get_dataset_sampler(self):
+ """
+ Return instance of torch.utils.data.Sampler or None. Allows
+ for dataset to define custom sampling logic, such as
+ re-weighting the probability of samples being drawn.
+ See the `train` function in scripts/train.py, and torch
+ `DataLoader` documentation, for more info.
+ """
+ return None
+
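+
+# Illustrative sketch of constructing a SequenceDataset directly (assumed path, observation keys, and
+# config; observation modalities are assumed to have been registered with ObsUtils beforehand, as the
+# training pipeline normally does). In normal use the training script builds this from the experiment config.
+#
+#   dataset = SequenceDataset(
+#       hdf5_path="/path/to/demos.hdf5",
+#       obs_keys=("robot0_eef_pos", "object"),
+#       action_keys=("actions",),
+#       dataset_keys=("actions", "rewards", "dones"),
+#       action_config={"actions": {"normalization": None}},
+#       frame_stack=1,
+#       seq_length=10,
+#       hdf5_cache_mode="low_dim",
+#   )
+#   batch = dataset[0]   # dict with "obs", "actions", "index", ...
+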
+
+class R2D2Dataset(SequenceDataset):
+ def get_action_traj(self, ep):
+ action_traj = dict()
+ for key in self.action_keys:
+ action_traj[key] = self.hdf5_file[key][()].astype('float32')
+ if len(action_traj[key].shape) == 1:
+ action_traj[key] = np.reshape(action_traj[key], (-1, 1))
+
+ return action_traj
+
+ def load_demo_info(self, filter_by_attribute=None, demos=None, n_demos=None):
+ """
+ Args:
+ filter_by_attribute (str): if provided, use the provided filter key
+ to select a subset of demonstration trajectories to load
+
+ demos (list): list of demonstration keys to load from the hdf5 file. If
+ omitted, all demos in the file (or under the @filter_by_attribute
+ filter key) are used.
+ """
+
+ self.demos = ["demo"]
+
+ self.n_demos = len(self.demos)
+
+ # keep internal index maps to know which transitions belong to which demos
+ self._index_to_demo_id = dict() # maps every index to a demo id
+ self._demo_id_to_start_indices = dict() # gives start index per demo id
+ self._demo_id_to_demo_length = dict()
+
+ # segment time stamps
+ self._demo_id_to_segments = dict()
+
+ ep = self.demos[0]
+
+ # determine index mapping
+ self.total_num_sequences = 0
+ demo_length = self.hdf5_file["action/cartesian_velocity"].shape[0]
+ self._demo_id_to_start_indices[ep] = self.total_num_sequences
+ self._demo_id_to_demo_length[ep] = demo_length
+
+        # separate the demo into segments for better alignment
+ gripper_actions = list(self.hdf5_file["action/gripper_position"])
+ gripper_closed = [1 if x > 0 else 0 for x in gripper_actions]
+
+ try:
+            # find when the gripper first closes and then re-opens
+ gripper_close = gripper_closed.index(1)
+ gripper_open = gripper_close + gripper_closed[gripper_close:].index(0)
+ except ValueError:
+ # special case for (invalid) trajectories
+ gripper_close, gripper_open = int(demo_length / 3), int(demo_length / 3 * 2)
+ print("No gripper action:", gripper_actions)
+ self._demo_id_to_segments[ep] = [0, gripper_close, gripper_open, demo_length - 1]
+
+ num_sequences = demo_length
+ # determine actual number of sequences taking into account whether to pad for frame_stack and seq_length
+ if not self.pad_frame_stack:
+ num_sequences -= (self.n_frame_stack - 1)
+ if not self.pad_seq_length:
+ num_sequences -= (self.seq_length - 1)
+
+ if self.pad_seq_length:
+ assert demo_length >= 1 # sequence needs to have at least one sample
+ num_sequences = max(num_sequences, 1)
+ else:
+ assert num_sequences >= 1 # assume demo_length >= (self.n_frame_stack - 1 + self.seq_length)
+
+ for _ in range(num_sequences):
+ self._index_to_demo_id[self.total_num_sequences] = ep
+ self.total_num_sequences += 1
+
+ def load_dataset_in_memory(self, demo_list, hdf5_file, obs_keys, dataset_keys, load_next_obs):
+ """
+ Loads the hdf5 dataset into memory, preserving the structure of the file. Note that this
+ differs from `self.getitem_cache`, which, if active, actually caches the outputs of the
+ `getitem` operation.
+
+ Args:
+ demo_list (list): list of demo keys, e.g., 'demo_0'
+ hdf5_file (h5py.File): file handle to the hdf5 dataset.
+ obs_keys (list, tuple): observation keys to fetch, e.g., 'images'
+ dataset_keys (list, tuple): dataset keys to fetch, e.g., 'actions'
+ load_next_obs (bool): whether to load next_obs from the dataset
+
+ Returns:
+ all_data (dict): dictionary of loaded data.
+ """
+ all_data = dict()
+ print("SequenceDataset: loading dataset into memory...")
+
+ for ep in LogUtils.custom_tqdm(demo_list):
+ all_data[ep] = {}
+ all_data[ep]["attrs"] = {}
+ all_data[ep]["attrs"]["num_samples"] = hdf5_file["action/cartesian_velocity"].shape[0] # hack to get traj len
+ # get obs
+ all_data[ep]["obs"] = {k: hdf5_file["observation/{}".format(k)][()].astype('float32') for k in obs_keys}
+ if load_next_obs:
+ raise NotImplementedError
+ # get other dataset keys
+ for k in dataset_keys:
+ if k in hdf5_file.keys():
+ all_data[ep][k] = hdf5_file["{}".format(k)][()].astype('float32')
+ else:
+ raise NotImplementedError
+
+ return all_data
+
+ def get_dataset_for_ep(self, ep, key, try_to_use_cache=True):
+ """
+ Helper utility to get a dataset for a specific demonstration.
+ Takes into account whether the dataset has been loaded into memory.
+ """
+
+ # check if this key should be in memory
+ key_should_be_in_memory = try_to_use_cache and (self.hdf5_cache_mode in ["all", "low_dim"])
+ if key_should_be_in_memory:
+ # if key is an observation, it may not be in memory
+ if '/' in key:
+ key_splits = key.split('/')
+ key1 = key_splits[0]
+ key2 = "/".join(key_splits[1:])
+ if key1 == "observation" and key2 not in self.obs_keys_in_memory:
+ key_should_be_in_memory = False
+
+ if key_should_be_in_memory:
+ # read cache
+ if '/' in key:
+ key_splits = key.split('/')
+ key1 = key_splits[0]
+ key2 = "/".join(key_splits[1:])
+ if key1 == "observation":
+ ret = self.hdf5_cache[ep]["obs"][key2]
+ else:
+ ret = self.hdf5_cache[ep][key]
+ else:
+ ret = self.hdf5_cache[ep][key]
+ else:
+ # read from file
+            hd5key = key  # R2D2 hdf5 files use a flat layout (no "data/{demo}/" prefix)
+ ret = self.hdf5_file[hd5key]
+ return ret
+
+
+ def get_sequence_from_demo(self, demo_id, index_in_demo, keys, num_frames_to_stack=0, seq_length=1):
+ """
+ Extract a (sub)sequence of data items from a demo given the @keys of the items.
+
+ Args:
+ demo_id (str): id of the demo, e.g., demo_0
+ index_in_demo (int): beginning index of the sequence wrt the demo
+ keys (tuple): list of keys to extract
+            num_frames_to_stack (int): number of frames to stack. The sequence gets prepended with repeated items if out of range
+            seq_length (int): sequence length to extract. The sequence gets appended with repeated items if out of range
+
+ Returns:
+ a dictionary of extracted items.
+ """
+ assert num_frames_to_stack >= 0
+ assert seq_length >= 1
+
+ demo_length = self._demo_id_to_demo_length[demo_id]
+ assert index_in_demo < demo_length
+
+ # determine begin and end of sequence
+ seq_begin_index = max(0, index_in_demo - num_frames_to_stack)
+ seq_end_index = min(demo_length, index_in_demo + seq_length)
+
+ # determine sequence padding
+ seq_begin_pad = max(0, num_frames_to_stack - index_in_demo) # pad for frame stacking
+ seq_end_pad = max(0, index_in_demo + seq_length - demo_length) # pad for sequence length
+
+ # make sure we are not padding if specified.
+ if not self.pad_frame_stack:
+ assert seq_begin_pad == 0
+ if not self.pad_seq_length:
+ assert seq_end_pad == 0
+
+ # fetch observation from the dataset file
+ seq = dict()
+ for k in keys:
+ data = self.get_dataset_for_ep(demo_id, k)
+ seq[k] = data[seq_begin_index: seq_end_index].astype("float32")
+
+ seq = TensorUtils.pad_sequence(seq, padding=(seq_begin_pad, seq_end_pad), pad_same=True)
+ pad_mask = np.array([0] * seq_begin_pad + [1] * (seq_end_index - seq_begin_index) + [0] * seq_end_pad)
+ pad_mask = pad_mask[:, None].astype(bool)
+
+ return seq, pad_mask
+
+
+ def get_item(self, index):
+ """
+ Main implementation of getitem when not using cache.
+ """
+
+ demo_id = self._index_to_demo_id[index]
+ demo_start_index = self._demo_id_to_start_indices[demo_id]
+ demo_length = self._demo_id_to_demo_length[demo_id]
+
+ # start at offset index if not padding for frame stacking
+ demo_index_offset = 0 if self.pad_frame_stack else (self.n_frame_stack - 1)
+ index_in_demo = index - demo_start_index + demo_index_offset
+
+ # end at offset index if not padding for seq length
+ demo_length_offset = 0 if self.pad_seq_length else (self.seq_length - 1)
+ end_index_in_demo = demo_length - demo_length_offset
+
+ meta = self.get_dataset_sequence_from_demo(
+ demo_id,
+ index_in_demo=index_in_demo,
+ keys=self.dataset_keys,
+ num_frames_to_stack=self.n_frame_stack - 1,
+ seq_length=self.seq_length,
+ )
+
+ # determine goal index
+ goal_index = None
+ if self.goal_mode == "last":
+ goal_index = end_index_in_demo - 1
+
+ meta["obs"] = self.get_obs_sequence_from_demo(
+ demo_id,
+ index_in_demo=index_in_demo,
+ keys=self.obs_keys,
+ num_frames_to_stack=self.n_frame_stack - 1,
+ seq_length=self.seq_length,
+ prefix="observation"
+ )
+
+ if self.load_next_obs:
+ meta["next_obs"] = self.get_obs_sequence_from_demo(
+ demo_id,
+ index_in_demo=index_in_demo,
+ keys=self.obs_keys,
+ num_frames_to_stack=self.n_frame_stack - 1,
+ seq_length=self.seq_length,
+ prefix="next_obs"
+ )
+
+ if goal_index is not None:
+ goal = self.get_obs_sequence_from_demo(
+ demo_id,
+ index_in_demo=goal_index,
+ keys=self.obs_keys,
+ num_frames_to_stack=0,
+ seq_length=1,
+ prefix="next_obs",
+ )
+ meta["goal_obs"] = {k: goal[k][0] for k in goal} # remove sequence dimension for goal
+
+ # get action components
+ ac_dict = OrderedDict()
+ for k in self.action_keys:
+ ac = meta[k]
+ # expand action shape if needed
+ if len(ac.shape) == 1:
+ ac = ac.reshape(-1, 1)
+ ac_dict[k] = ac
+
+ # normalize actions
+ action_normalization_stats = self.get_action_normalization_stats()
+ ac_dict = ObsUtils.normalize_dict(ac_dict, normalization_stats=action_normalization_stats)
+
+ # concatenate all action components
+ meta["actions"] = AcUtils.action_dict_to_vector(ac_dict)
+
+ # keys to reshape
+ for k in meta["obs"]:
+ if len(meta["obs"][k].shape) == 1:
+ meta["obs"][k] = np.expand_dims(meta["obs"][k], axis=1)
+
+ # also return the sampled index
+ meta["index"] = index
+
+ return meta
+
+
+class MetaDataset(torch.utils.data.Dataset):
+ def __init__(
+ self,
+ datasets,
+ ds_weights,
+ normalize_weights_by_ds_size=False,
+ ds_labels=None,
+ ):
+ super(MetaDataset, self).__init__()
+ self.datasets = datasets
+ ds_lens = np.array([len(ds) for ds in self.datasets])
+ if normalize_weights_by_ds_size:
+ self.ds_weights = np.array(ds_weights) / ds_lens
+ else:
+ self.ds_weights = ds_weights
+ self._ds_ind_bins = np.cumsum([0] + list(ds_lens))
+
+ # cache mode "all" not supported! The action normalization stats of each
+ # dataset will change after the datasets are already initialized
+ for ds in self.datasets:
+ assert ds.hdf5_cache_mode != "all"
+
+        # map dataset labels to one-hot ids
+        if ds_labels is None:
+            # default to one dummy label per dataset so label lookups by dataset index still work
+            self.ds_labels = ["dummy"] * len(self.datasets)
+ else:
+ self.ds_labels = ds_labels
+
+ unique_labels = sorted(set(self.ds_labels))
+
+ self.ds_labels_to_ids = {}
+ for i, label in enumerate(sorted(unique_labels)):
+ one_hot_id = np.zeros(len(unique_labels))
+ one_hot_id[i] = 1.0
+ self.ds_labels_to_ids[label] = one_hot_id
+
+        # compute action normalization stats over all sub-datasets and share them with each one
+ action_stats = self.get_action_stats()
+ self.action_normalization_stats = action_stats_to_normalization_stats(
+ action_stats, self.datasets[0].action_config)
+ self.set_action_normalization_stats(self.action_normalization_stats)
+
+ def __len__(self):
+ return np.sum([len(ds) for ds in self.datasets])
+
+ def __getitem__(self, idx):
+ ds_ind = np.digitize(idx, self._ds_ind_bins) - 1
+ ind_in_ds = idx - self._ds_ind_bins[ds_ind]
+ meta = self.datasets[ds_ind].__getitem__(ind_in_ds)
+ meta["index"] = idx
+ return meta
+
+ def get_ds_label(self, idx):
+ ds_ind = np.digitize(idx, self._ds_ind_bins) - 1
+ ds_label = self.ds_labels[ds_ind]
+ return ds_label
+
+ def get_ds_id(self, idx):
+ ds_ind = np.digitize(idx, self._ds_ind_bins) - 1
+ ds_label = self.ds_labels[ds_ind]
+ return self.ds_labels_to_ids[ds_label]
+
+ def __repr__(self):
+ str_output = '\n'.join([ds.__repr__() for ds in self.datasets])
+ return str_output
+
+ def get_dataset_sampler(self):
+ weights = np.ones(len(self))
+ for i, (start, end) in enumerate(zip(self._ds_ind_bins[:-1], self._ds_ind_bins[1:])):
+ weights[start:end] = self.ds_weights[i]
+
+ sampler = torch.utils.data.WeightedRandomSampler(
+ weights=weights,
+ num_samples=len(self),
+ replacement=True,
+ )
+ return sampler
+
+ def get_action_stats(self):
+ meta_action_stats = self.datasets[0].get_action_stats()
+ for dataset in self.datasets[1:]:
+ ds_action_stats = dataset.get_action_stats()
+ meta_action_stats = _aggregate_traj_stats(meta_action_stats, ds_action_stats)
+
+ return meta_action_stats
+
+ def set_action_normalization_stats(self, action_normalization_stats):
+ self.action_normalization_stats = action_normalization_stats
+ for ds in self.datasets:
+ ds.set_action_normalization_stats(self.action_normalization_stats)
+
+ def get_action_normalization_stats(self):
+ """
+ Computes a dataset-wide min, max, mean and standard deviation for the actions
+ (per dimension) and returns it.
+ """
+
+        # Run through all trajectories. For each one, compute its action statistics, and then aggregate
+ # with the previous statistics.
+ if self.action_normalization_stats is None:
+ action_stats = self.get_action_stats()
+ self.action_normalization_stats = action_stats_to_normalization_stats(
+ action_stats, self.datasets[0].action_config)
+ return self.action_normalization_stats
+
+def _compute_traj_stats(traj_obs_dict):
+ """
+ Helper function to compute statistics over a single trajectory of observations.
+ """
+ traj_stats = { k : {} for k in traj_obs_dict }
+ for k in traj_obs_dict:
+ traj_stats[k]["n"] = traj_obs_dict[k].shape[0]
+ traj_stats[k]["mean"] = traj_obs_dict[k].mean(axis=0, keepdims=True) # [1, ...]
+ traj_stats[k]["sqdiff"] = ((traj_obs_dict[k] - traj_stats[k]["mean"]) ** 2).sum(axis=0, keepdims=True) # [1, ...]
+ traj_stats[k]["min"] = traj_obs_dict[k].min(axis=0, keepdims=True)
+ traj_stats[k]["max"] = traj_obs_dict[k].max(axis=0, keepdims=True)
+ return traj_stats
+
+def _aggregate_traj_stats(traj_stats_a, traj_stats_b):
+ """
+ Helper function to aggregate trajectory statistics.
+ See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+ for more information.
+ """
+ merged_stats = {}
+ for k in traj_stats_a:
+ n_a, avg_a, M2_a, min_a, max_a = traj_stats_a[k]["n"], traj_stats_a[k]["mean"], traj_stats_a[k]["sqdiff"], traj_stats_a[k]["min"], traj_stats_a[k]["max"]
+ n_b, avg_b, M2_b, min_b, max_b = traj_stats_b[k]["n"], traj_stats_b[k]["mean"], traj_stats_b[k]["sqdiff"], traj_stats_b[k]["min"], traj_stats_b[k]["max"]
+ n = n_a + n_b
+ mean = (n_a * avg_a + n_b * avg_b) / n
+ delta = (avg_b - avg_a)
+ M2 = M2_a + M2_b + (delta ** 2) * (n_a * n_b) / n
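+        # Sanity-check with illustrative numbers: merging stats of samples [1, 1] (n_a=2, avg_a=1, M2_a=0)
+        # with stats of samples [2, 4] (n_b=2, avg_b=3, M2_b=2) gives n=4, mean=2 and
+        # M2 = 0 + 2 + (3 - 1)**2 * (2 * 2) / 4 = 6, matching sum((x - 2)**2) over [1, 1, 2, 4].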
+ min_ = np.minimum(min_a, min_b)
+ max_ = np.maximum(max_a, max_b)
+ merged_stats[k] = dict(n=n, mean=mean, sqdiff=M2, min=min_, max=max_)
+ return merged_stats
+
+def action_stats_to_normalization_stats(action_stats, action_config):
+ action_normalization_stats = OrderedDict()
+ for action_key in action_stats.keys():
+ # get how this action should be normalized from config, default to None
+ norm_method = action_config[action_key].get("normalization", None)
+ if norm_method is None:
+ # no normalization, unit scale, zero offset
+ action_normalization_stats[action_key] = {
+ "scale": np.ones_like(action_stats[action_key]["mean"], dtype=np.float32),
+ "offset": np.zeros_like(action_stats[action_key]["mean"], dtype=np.float32)
+ }
+ elif norm_method == "min_max":
+ # normalize min to -1 and max to 1
+ range_eps = 1e-4
+ input_min = action_stats[action_key]["min"].astype(np.float32)
+ input_max = action_stats[action_key]["max"].astype(np.float32)
+ # instead of -1 and 1 use numbers just below threshold to prevent numerical instability issues
+ output_min = -0.999999
+ output_max = 0.999999
+
+            # ignore input dimensions that are too small, to prevent division by zero
+ input_range = input_max - input_min
+ ignore_dim = input_range < range_eps
+ input_range[ignore_dim] = output_max - output_min
+
+ # expected usage of scale and offset
+ # normalized_action = (raw_action - offset) / scale
+ # raw_action = scale * normalized_action + offset
+
+ # eq1: input_max = scale * output_max + offset
+ # eq2: input_min = scale * output_min + offset
+
+ # solution for scale and offset
+ # eq1 - eq2:
+ # input_max - input_min = scale * (output_max - output_min)
+ # (input_max - input_min) / (output_max - output_min) = scale <- eq3
+ # offset = input_min - scale * output_min <- eq4
+ scale = input_range / (output_max - output_min)
+ offset = input_min - scale * output_min
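+            # For example (illustrative numbers): a dimension spanning [-2, 2] gets scale ~= 2 and
+            # offset ~= 0, so raw values of -2 and 2 map to normalized values of roughly -1 and 1.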
+
+ offset[ignore_dim] = input_min[ignore_dim] - (output_max + output_min) / 2
+
+ action_normalization_stats[action_key] = {
+ "scale": scale,
+ "offset": offset
+ }
+ elif norm_method == "gaussian":
+ # normalize to zero mean unit variance
+ input_mean = action_stats[action_key]["mean"].astype(np.float32)
+ input_std = np.sqrt(action_stats[action_key]["sqdiff"] / action_stats[action_key]["n"]).astype(np.float32)
+
+            # ignore input dimensions that are too small, to prevent division by zero
+ std_eps = 1e-6
+ ignore_dim = input_std < std_eps
+ input_std[ignore_dim] = 1.0
+
+            action_normalization_stats[action_key] = {
+                # per the convention normalized = (raw - offset) / scale, offset is the mean and scale is the std
+                "scale": input_std,
+                "offset": input_mean
+ }
+ else:
+ raise NotImplementedError(
+ 'action_config.actions.normalization: "{}" is not supported'.format(norm_method))
+
+ return action_normalization_stats
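+
+
+# Illustrative sketch of the action_config consumed above (assumed key names): each action key maps to a
+# config dict whose optional "normalization" entry selects one of the supported methods
+# (None, "min_max", or "gaussian").
+#
+#   action_config = {
+#       "action/cartesian_velocity": {"normalization": "min_max"},
+#       "action/gripper_position": {"normalization": None},
+#   }
+#   norm_stats = action_stats_to_normalization_stats(dataset.get_action_stats(), action_config)
+#   # each entry holds "scale" and "offset" arrays used as: normalized = (raw - offset) / scale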
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/env_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/env_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..61c7500daaf7026977005e798a483049695611e4
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/env_utils.py
@@ -0,0 +1,385 @@
+"""
+This file contains several utility functions for working with environment
+wrappers provided by the repository, and with environment metadata saved
+in dataset files.
+"""
+from copy import deepcopy
+import robomimic.envs.env_base as EB
+from robomimic.utils.log_utils import log_warning
+
+
+def get_env_class(env_meta=None, env_type=None, env=None):
+ """
+ Return env class from either env_meta, env_type, or env.
+ Note the use of lazy imports - this ensures that modules are only
+ imported when the corresponding env type is requested. This can
+ be useful in practice. For example, a training run that only
+ requires access to gym environments should not need to import
+ robosuite.
+
+ Args:
+ env_meta (dict): environment metadata, which should be loaded from demonstration
+ hdf5 with @FileUtils.get_env_metadata_from_dataset or from checkpoint (see
+ @FileUtils.env_from_checkpoint). Contains 3 keys:
+
+ :`'env_name'`: name of environment
+ :`'type'`: type of environment, should be a value in EB.EnvType
+ :`'env_kwargs'`: dictionary of keyword arguments to pass to environment constructor
+
+ env_type (int): the type of environment, which determines the env class that will
+ be instantiated. Should be a value in EB.EnvType.
+
+ env (instance of EB.EnvBase): environment instance
+ """
+ env_type = get_env_type(env_meta=env_meta, env_type=env_type, env=env)
+ if env_type == EB.EnvType.ROBOSUITE_TYPE:
+ from robomimic.envs.env_robosuite import EnvRobosuite
+ return EnvRobosuite
+ elif env_type == EB.EnvType.GYM_TYPE:
+ from robomimic.envs.env_gym import EnvGym
+ return EnvGym
+ elif env_type == EB.EnvType.IG_MOMART_TYPE:
+ from robomimic.envs.env_ig_momart import EnvGibsonMOMART
+ return EnvGibsonMOMART
+ elif env_type == EB.EnvType.REAL_TYPE:
+ from robomimic.envs.env_real_panda import EnvRealPanda
+ return EnvRealPanda
+ elif env_type == EB.EnvType.GPRS_REAL_TYPE:
+ from robomimic.envs.env_real_panda_gprs import EnvRealPandaGPRS
+ return EnvRealPandaGPRS
+ raise Exception("code should never reach this point")
+
+
+def get_env_type(env_meta=None, env_type=None, env=None):
+ """
+ Helper function to get env_type from a variety of inputs.
+
+ Args:
+ env_meta (dict): environment metadata, which should be loaded from demonstration
+ hdf5 with @FileUtils.get_env_metadata_from_dataset or from checkpoint (see
+ @FileUtils.env_from_checkpoint). Contains 3 keys:
+
+ :`'env_name'`: name of environment
+ :`'type'`: type of environment, should be a value in EB.EnvType
+ :`'env_kwargs'`: dictionary of keyword arguments to pass to environment constructor
+
+ env_type (int): the type of environment, which determines the env class that will
+ be instantiated. Should be a value in EB.EnvType.
+
+ env (instance of EB.EnvBase): environment instance
+ """
+ checks = [(env_meta is not None), (env_type is not None), (env is not None)]
+ assert sum(checks) == 1, "should provide only one of env_meta, env_type, env"
+ if env_meta is not None:
+ env_type = env_meta["type"]
+ elif env is not None:
+ env_type = env.type
+ return env_type
+
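+# Illustrative sketch (made-up values) of the env_meta dictionary consumed by the helpers in this file:
+#
+#   env_meta = {
+#       "env_name": "Lift",
+#       "type": EB.EnvType.ROBOSUITE_TYPE,
+#       "env_kwargs": {"robots": "Panda"},
+#   }
+#   env_class = get_env_class(env_meta=env_meta)   # resolves to EnvRobosuite via a lazy import
+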
+
+def check_env_type(type_to_check, env_meta=None, env_type=None, env=None):
+ """
+ Checks whether the passed env_meta, env_type, or env is of type @type_to_check.
+ Type corresponds to EB.EnvType.
+
+ Args:
+ type_to_check (int): type to check equality against
+
+ env_meta (dict): environment metadata, which should be loaded from demonstration
+ hdf5 with @FileUtils.get_env_metadata_from_dataset or from checkpoint (see
+ @FileUtils.env_from_checkpoint). Contains 3 keys:
+
+ :`'env_name'`: name of environment
+ :`'type'`: type of environment, should be a value in EB.EnvType
+ :`'env_kwargs'`: dictionary of keyword arguments to pass to environment constructor
+
+ env_type (int): the type of environment, which determines the env class that will
+ be instantiated. Should be a value in EB.EnvType.
+
+ env (instance of EB.EnvBase): environment instance
+ """
+ env_type = get_env_type(env_meta=env_meta, env_type=env_type, env=env)
+ return (env_type == type_to_check)
+
+
+def check_env_version(env, env_meta):
+ """
+    Checks whether the passed env and env_meta dictionary have matching environment versions.
+ Logs warning if cannot find version or versions do not match.
+
+ Args:
+ env (instance of EB.EnvBase): environment instance
+
+ env_meta (dict): environment metadata, which should be loaded from demonstration
+ hdf5 with @FileUtils.get_env_metadata_from_dataset or from checkpoint (see
+ @FileUtils.env_from_checkpoint). Contains following key:
+
+ :`'env_version'`: environment version, type str
+ """
+ env_system_version = env.version
+ env_meta_version = env_meta.get("env_version", None)
+
+ if env_meta_version is None:
+ log_warning(
+ "No environment version found in dataset!"\
+ "\nCannot verify if dataset and installed environment versions match"\
+ )
+ elif env_system_version != env_meta_version:
+ log_warning(
+ "Dataset and installed environment version mismatch!"\
+ "\nDataset environment version: {meta}"\
+ "\nInstalled environment version: {sys}".format(
+ sys=env_system_version,
+ meta=env_meta_version,
+ )
+ )
+
+
+def is_robosuite_env(env_meta=None, env_type=None, env=None):
+ """
+ Determines whether the environment is a robosuite environment. Accepts
+ either env_meta, env_type, or env.
+ """
+ return check_env_type(type_to_check=EB.EnvType.ROBOSUITE_TYPE, env_meta=env_meta, env_type=env_type, env=env)
+
+
+def is_simpler_env(env_meta=None, env_type=None, env=None):
+ return False
+
+
+def is_simpler_ov_env(env_meta=None, env_type=None, env=None):
+ return False
+
+
+def is_factory_env(env_meta=None, env_type=None, env=None):
+ return False
+
+
+def is_furniture_sim_env(env_meta=None, env_type=None, env=None):
+ return False
+
+
+def is_real_robot_env(env_meta=None, env_type=None, env=None):
+ """
+ Determines whether the environment is a real robot environment. Accepts
+ either env_meta, env_type, or env.
+ """
+ return check_env_type(type_to_check=EB.EnvType.REAL_TYPE, env_meta=env_meta, env_type=env_type, env=env)
+
+
+def is_real_robot_gprs_env(env_meta=None, env_type=None, env=None):
+ """
+ Determines whether the environment is a real robot environment. Accepts
+ either env_meta, env_type, or env.
+ """
+ return check_env_type(type_to_check=EB.EnvType.GPRS_REAL_TYPE, env_meta=env_meta, env_type=env_type, env=env)
+
+
+def create_env(
+ env_type,
+ env_name,
+ env_class=None,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=False,
+ use_depth_obs=False,
+ **kwargs,
+):
+ """
+ Create environment.
+
+ Args:
+ env_type (int): the type of environment, which determines the env class that will
+ be instantiated. Should be a value in EB.EnvType.
+
+ env_name (str): name of environment
+
+ render (bool): if True, environment supports on-screen rendering
+
+ render_offscreen (bool): if True, environment supports off-screen rendering. This
+ is forced to be True if @use_image_obs is True.
+
+ use_image_obs (bool): if True, environment is expected to render rgb image observations
+ on every env.step call. Set this to False for efficiency reasons, if image
+ observations are not required.
+
+ use_depth_obs (bool): if True, environment is expected to render depth image observations
+ on every env.step call. Set this to False for efficiency reasons, if depth
+ observations are not required.
+ """
+
+ # note: pass @postprocess_visual_obs True, to make sure images are processed for network inputs
+ if env_class is None:
+ env_class = get_env_class(env_type=env_type)
+ env = env_class(
+ env_name=env_name,
+ render=render,
+ render_offscreen=render_offscreen,
+ use_image_obs=use_image_obs,
+ use_depth_obs=use_depth_obs,
+ postprocess_visual_obs=True,
+ **kwargs,
+ )
+ print("Created environment with name {}".format(env_name))
+ print("Action size is {}".format(env.action_dimension))
+ return env
+
+
+def create_env_from_metadata(
+ env_meta,
+ env_name=None,
+ env_class=None,
+ render=False,
+ render_offscreen=False,
+ use_image_obs=False,
+ use_depth_obs=False,
+):
+ """
+ Create environment.
+
+ Args:
+ env_meta (dict): environment metadata, which should be loaded from demonstration
+ hdf5 with @FileUtils.get_env_metadata_from_dataset or from checkpoint (see
+ @FileUtils.env_from_checkpoint). Contains 3 keys:
+
+ :`'env_name'`: name of environment
+ :`'type'`: type of environment, should be a value in EB.EnvType
+ :`'env_kwargs'`: dictionary of keyword arguments to pass to environment constructor
+
+ env_name (str): name of environment. Only needs to be provided if making a different
+ environment from the one in @env_meta.
+
+ render (bool): if True, environment supports on-screen rendering
+
+ render_offscreen (bool): if True, environment supports off-screen rendering. This
+ is forced to be True if @use_image_obs is True.
+
+ use_image_obs (bool): if True, environment is expected to render rgb image observations
+ on every env.step call. Set this to False for efficiency reasons, if image
+ observations are not required.
+
+ use_depth_obs (bool): if True, environment is expected to render depth image observations
+ on every env.step call. Set this to False for efficiency reasons, if depth
+ observations are not required.
+ """
+ if env_name is None:
+ env_name = env_meta["env_name"]
+ env_type = get_env_type(env_meta=env_meta)
+ env_kwargs = env_meta["env_kwargs"]
+ env_kwargs.pop("use_image_obs", None)
+ env_kwargs.pop("use_depth_obs", None)
+
+ env = create_env(
+ env_type=env_type,
+ env_name=env_name,
+ env_class=env_class,
+ render=render,
+ render_offscreen=render_offscreen,
+ use_image_obs=use_image_obs,
+ use_depth_obs=use_depth_obs,
+ **env_kwargs,
+ )
+ check_env_version(env, env_meta)
+ return env
+
+
+def create_env_for_data_processing(
+ env_meta,
+ camera_names,
+ camera_height,
+ camera_width,
+ reward_shaping,
+ env_class=None,
+ render=None,
+ render_offscreen=None,
+ use_image_obs=None,
+ use_depth_obs=None,
+):
+ """
+ Creates environment for processing dataset observations and rewards.
+
+ Args:
+ env_meta (dict): environment metadata, which should be loaded from demonstration
+ hdf5 with @FileUtils.get_env_metadata_from_dataset or from checkpoint (see
+ @FileUtils.env_from_checkpoint). Contains 3 keys:
+
+ :`'env_name'`: name of environment
+ :`'type'`: type of environment, should be a value in EB.EnvType
+ :`'env_kwargs'`: dictionary of keyword arguments to pass to environment constructor
+
+        camera_names (list of str): list of camera names that correspond to image observations
+
+ camera_height (int): camera height for all cameras
+
+ camera_width (int): camera width for all cameras
+
+ reward_shaping (bool): if True, use shaped environment rewards, else use sparse task completion rewards
+
+ render (bool or None): optionally override rendering behavior
+
+ render_offscreen (bool or None): optionally override rendering behavior
+
+ use_image_obs (bool or None): optionally override rendering behavior
+
+ use_depth_obs (bool or None): optionally override rendering behavior
+ """
+ env_name = env_meta["env_name"]
+ env_type = get_env_type(env_meta=env_meta)
+ env_kwargs = env_meta["env_kwargs"]
+ if env_class is None:
+ env_class = get_env_class(env_type=env_type)
+
+ # remove possibly redundant values in kwargs
+ env_kwargs = deepcopy(env_kwargs)
+ env_kwargs.pop("env_name", None)
+ env_kwargs.pop("camera_names", None)
+ env_kwargs.pop("camera_height", None)
+ env_kwargs.pop("camera_width", None)
+ env_kwargs.pop("reward_shaping", None)
+ env_kwargs.pop("render", None)
+ env_kwargs.pop("render_offscreen", None)
+ env_kwargs.pop("use_image_obs", None)
+ env_kwargs.pop("use_depth_obs", None)
+
+ env = env_class.create_for_data_processing(
+ env_name=env_name,
+ camera_names=camera_names,
+ camera_height=camera_height,
+ camera_width=camera_width,
+ reward_shaping=reward_shaping,
+ render=render,
+ render_offscreen=render_offscreen,
+ use_image_obs=use_image_obs,
+ use_depth_obs=use_depth_obs,
+ **env_kwargs,
+ )
+ check_env_version(env, env_meta)
+ return env
+
+
+def set_env_specific_obs_processing(env_meta=None, env_type=None, env=None):
+ """
+ Sets env-specific observation processing. As an example, robosuite depth observations
+ correspond to raw depth and should not be normalized by default, while default depth
+ processing normalizes and clips all values to [0, 1].
+ """
+ if is_robosuite_env(env_meta=env_meta, env_type=env_type, env=env):
+ from robomimic.utils.obs_utils import DepthModality, process_frame, unprocess_frame
+ DepthModality.set_obs_processor(processor=(
+ lambda obs: process_frame(frame=obs, channel_dim=1, scale=None)
+ ))
+ DepthModality.set_obs_unprocessor(unprocessor=(
+ lambda obs: unprocess_frame(frame=obs, channel_dim=1, scale=None)
+ ))
+
+
+def wrap_env_from_config(env, config):
+ """
+ Wraps environment using the provided Config object to determine which wrappers
+ to use (if any).
+ """
+ if ("frame_stack" in config.train) and (config.train.frame_stack > 1):
+ from robomimic.envs.wrappers import FrameStackWrapper
+ env = FrameStackWrapper(env, num_frames=config.train.frame_stack)
+
+ return env
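+
+
+# Illustrative usage (assumed config values): with config.train.frame_stack = 2 the environment is wrapped
+# in FrameStackWrapper so that each observation passed to the policy stacks the two most recent frames.
+#
+#   env = wrap_env_from_config(env, config)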
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/file_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..67e43d07f96ea354e9150c5dd90721139371175c
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/file_utils.py
@@ -0,0 +1,616 @@
+"""
+A collection of utility functions for working with files, such as reading metadata from
+demonstration datasets, loading model checkpoints, or downloading dataset files.
+"""
+import os
+import h5py
+import json
+import time
+import urllib.request
+import numpy as np
+from collections import OrderedDict
+from tqdm import tqdm
+
+import torch
+
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.env_utils as EnvUtils
+import robomimic.utils.torch_utils as TorchUtils
+from robomimic.config import config_factory
+from robomimic.algo import algo_factory
+from robomimic.algo import RolloutPolicy
+
+
+def create_hdf5_filter_key(hdf5_path, demo_keys, key_name):
+ """
+ Creates a new hdf5 filter key in hdf5 file @hdf5_path with
+ name @key_name that corresponds to the demonstrations
+ @demo_keys. Filter keys are generally useful to create
+ named subsets of the demonstrations in an hdf5, making it
+ easy to train, test, or report statistics on a subset of
+ the trajectories in a file.
+
+ Returns the list of episode lengths that correspond to the filtering.
+
+ Args:
+ hdf5_path (str): path to hdf5 file
+ demo_keys ([str]): list of demonstration keys which should
+ correspond to this filter key. For example, ["demo_0",
+ "demo_1"].
+ key_name (str): name of filter key to create
+
+ Returns:
+ ep_lengths ([int]): list of episode lengths that corresponds to
+ each demonstration in the new filter key
+ """
+ f = h5py.File(hdf5_path, "a")
+ demos = sorted(list(f["data"].keys()))
+
+ # collect episode lengths for the keys of interest
+ ep_lengths = []
+ for ep in demos:
+ ep_data_grp = f["data/{}".format(ep)]
+ if ep in demo_keys:
+ ep_lengths.append(ep_data_grp.attrs["num_samples"])
+
+ # store list of filtered keys under mask group
+ k = "mask/{}".format(key_name)
+ if k in f:
+ del f[k]
+ f[k] = np.array(demo_keys, dtype='S')
+
+ f.close()
+ return ep_lengths
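+
+# Illustrative usage sketch (the dataset path and demo names are hypothetical):
+#
+#   ep_lengths = create_hdf5_filter_key(
+#       hdf5_path="demos.hdf5",
+#       demo_keys=["demo_0", "demo_1"],
+#       key_name="train",
+#   )
+#   # the subset can later be recovered with get_demos_for_filter_key("demos.hdf5", "train")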
+
+
+def get_demos_for_filter_key(hdf5_path, filter_key):
+ """
+ Gets demo keys that correspond to a particular filter key.
+
+ Args:
+ hdf5_path (str): path to hdf5 file
+ filter_key (str): name of filter key
+
+ Returns:
+ demo_keys ([str]): list of demonstration keys that
+ correspond to this filter key. For example, ["demo_0",
+ "demo_1"].
+ """
+ f = h5py.File(hdf5_path, "r")
+ demo_keys = [elem.decode("utf-8") for elem in np.array(f["mask/{}".format(filter_key)][:])]
+ f.close()
+ return demo_keys
+
+
+def get_env_metadata_from_dataset(dataset_path, ds_format="robomimic", set_env_specific_obs_processors=True):
+ """
+ Retrieves env metadata from dataset.
+
+ Args:
+        dataset_path (str): path to dataset
+
+        ds_format (str): dataset format - one of "robomimic" or "r2d2"
+
+ set_env_specific_obs_processors (bool): environment might have custom rules for how to process
+ observations - if this flag is true, make sure ObsUtils will use these custom settings. This
+ is a good place to do this operation to make sure it happens before loading data, running a
+ trained model, etc.
+
+ Returns:
+ env_meta (dict): environment metadata. Contains 3 keys:
+
+ :`'env_name'`: name of environment
+ :`'type'`: type of environment, should be a value in EB.EnvType
+ :`'env_kwargs'`: dictionary of keyword arguments to pass to environment constructor
+ """
+ dataset_path = os.path.expandvars(os.path.expanduser(dataset_path))
+ f = h5py.File(dataset_path, "r")
+ if ds_format == "robomimic":
+ env_meta = json.loads(f["data"].attrs["env_args"])
+ elif ds_format == "r2d2":
+ env_meta = dict(f.attrs)
+ else:
+        raise ValueError("unsupported dataset format: {}".format(ds_format))
+ f.close()
+ if set_env_specific_obs_processors:
+ # handle env-specific custom observation processing logic
+ EnvUtils.set_env_specific_obs_processing(env_meta=env_meta)
+ return env_meta
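+
+# Illustrative usage sketch (the dataset path is hypothetical):
+#
+#   env_meta = get_env_metadata_from_dataset(dataset_path="demos.hdf5")
+#   env = EnvUtils.create_env_from_metadata(env_meta=env_meta, render=False, render_offscreen=True)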
+
+
+def get_shape_metadata_from_dataset(dataset_path, action_keys, all_obs_keys=None, ds_format="robomimic", verbose=False):
+ """
+ Retrieves shape metadata from dataset.
+
+ Args:
+ dataset_path (str): path to dataset
+ action_keys (list): list of all action key strings
+ all_obs_keys (list): list of all modalities used by the model. If not provided, all modalities
+ present in the file are used.
+        ds_format (str): dataset format - one of "robomimic" or "r2d2"
+        verbose (bool): if True, include print statements
+
+ Returns:
+ shape_meta (dict): shape metadata. Contains the following keys:
+
+ :`'ac_dim'`: action space dimension
+ :`'all_shapes'`: dictionary that maps observation key string to shape
+ :`'all_obs_keys'`: list of all observation modalities used
+ :`'use_images'`: bool, whether or not image modalities are present
+ :`'use_depths'`: bool, whether or not depth modalities are present
+ """
+
+ shape_meta = {}
+
+ # read demo file for some metadata
+ dataset_path = os.path.expandvars(os.path.expanduser(dataset_path))
+ f = h5py.File(dataset_path, "r")
+
+ if ds_format == "robomimic":
+ demo_id = list(f["data"].keys())[0]
+ demo = f["data/{}".format(demo_id)]
+
+ for key in action_keys:
+ assert len(demo[key].shape) == 2 # shape should be (B, D)
+ action_dim = sum([demo[key].shape[1] for key in action_keys])
+ shape_meta["ac_dim"] = action_dim
+
+ # observation dimensions
+ all_shapes = OrderedDict()
+
+ if all_obs_keys is None:
+ # use all modalities present in the file
+ all_obs_keys = [k for k in demo["obs"]]
+
+ for k in sorted(all_obs_keys):
+ initial_shape = demo["obs/{}".format(k)].shape[1:]
+ if verbose:
+ print("obs key {} with shape {}".format(k, initial_shape))
+ # Store processed shape for each obs key
+ all_shapes[k] = ObsUtils.get_processed_shape(
+ obs_modality=ObsUtils.OBS_KEYS_TO_MODALITIES[k],
+ input_shape=initial_shape,
+ )
+ elif ds_format == "r2d2":
+ for key in action_keys:
+ assert len(f[key].shape) == 2 # shape should be (B, D)
+ action_dim = sum([f[key].shape[1] for key in action_keys])
+ shape_meta["ac_dim"] = action_dim
+
+ # observation dimensions
+ all_shapes = OrderedDict()
+
+ # hack all relevant obs shapes for now
+ for k in [
+ "robot_state/cartesian_position",
+ "robot_state/gripper_position",
+ "robot_state/joint_positions",
+ "camera/image/hand_camera_image",
+ "camera/image/varied_camera_1_image",
+ "camera/image/varied_camera_2_image",
+ ]:
+ initial_shape = f["observation/{}".format(k)].shape[1:]
+ if len(initial_shape) == 0:
+ initial_shape = (1,)
+
+ all_shapes[k] = ObsUtils.get_processed_shape(
+ obs_modality=ObsUtils.OBS_KEYS_TO_MODALITIES[k],
+ input_shape=initial_shape,
+ )
+    else:
+        raise ValueError("unsupported dataset format: {}".format(ds_format))
+
+ f.close()
+
+ shape_meta['all_shapes'] = all_shapes
+ shape_meta['all_obs_keys'] = all_obs_keys
+ shape_meta['use_images'] = ObsUtils.has_modality("rgb", all_obs_keys)
+ shape_meta['use_depths'] = ObsUtils.has_modality("depth", all_obs_keys)
+
+ return shape_meta
+
+
+def get_intervention_segments(interventions):
+ """
+ Splits interventions list into a list of start and end indices (windows) of continuous intervention segments.
+ """
+ interventions = interventions.reshape(-1).astype(int)
+ # pad before and after to make it easy to count starting and ending intervention segments
+ expanded_ints = [False] + interventions.astype(bool).tolist() + [False]
+ start_inds = []
+ end_inds = []
+ for i in range(1, len(expanded_ints)):
+ if expanded_ints[i] and (not expanded_ints[i - 1]):
+ # low to high edge means start of new window
+            start_inds.append(i - 1) # record index in original array, which is one less (since we prepended an element)
+        elif (not expanded_ints[i]) and expanded_ints[i - 1]:
+            # high to low edge means end of previous window
+            end_inds.append(i - 1) # record index in original array, which is one less (since we prepended an element)
+
+ # run some sanity checks
+ assert len(start_inds) == len(end_inds), "missing window edge"
+ assert np.all([np.sum(interventions[s : e]) == (e - s) for s, e in zip(start_inds, end_inds)]), "window computation covers non-interventions"
+ assert sum([np.sum(interventions[s : e]) for s, e in zip(start_inds, end_inds)]) == np.sum(interventions), "window computation does not cover all interventions"
+ return list(zip(start_inds, end_inds))
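+
+# Worked example of the segment computation above (end indices are exclusive):
+#
+#   get_intervention_segments(np.array([0, 1, 1, 0, 1]))   # -> [(1, 3), (4, 5)]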
+
+
+def load_dict_from_checkpoint(ckpt_path):
+ """
+ Load checkpoint dictionary from a checkpoint file.
+
+ Args:
+ ckpt_path (str): Path to checkpoint file.
+
+ Returns:
+ ckpt_dict (dict): Loaded checkpoint dictionary.
+ """
+ ckpt_path = os.path.expandvars(os.path.expanduser(ckpt_path))
+ if not torch.cuda.is_available():
+ ckpt_dict = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
+ else:
+ ckpt_dict = torch.load(ckpt_path)
+ return ckpt_dict
+
+
+def maybe_dict_from_checkpoint(ckpt_path=None, ckpt_dict=None):
+ """
+ Utility function for the common use case where either an ckpt path
+ or a ckpt_dict is provided. This is a no-op if ckpt_dict is not
+ None, otherwise it loads the model dict from the ckpt path.
+
+ Args:
+ ckpt_path (str): Path to checkpoint file. Only needed if not providing @ckpt_dict.
+
+ ckpt_dict(dict): Loaded model checkpoint dictionary. Only needed if not providing @ckpt_path.
+
+ Returns:
+ ckpt_dict (dict): Loaded checkpoint dictionary.
+ """
+ assert (ckpt_path is not None) or (ckpt_dict is not None)
+ if ckpt_dict is None:
+ ckpt_dict = load_dict_from_checkpoint(ckpt_path)
+ return ckpt_dict
+
+
+def algo_name_from_checkpoint(ckpt_path=None, ckpt_dict=None):
+ """
+ Return algorithm name that was used to train a checkpoint or
+ loaded model dictionary.
+
+ Args:
+ ckpt_path (str): Path to checkpoint file. Only needed if not providing @ckpt_dict.
+
+ ckpt_dict(dict): Loaded model checkpoint dictionary. Only needed if not providing @ckpt_path.
+
+ Returns:
+ algo_name (str): algorithm name
+
+ ckpt_dict (dict): loaded checkpoint dictionary (convenient to avoid
+ re-loading checkpoint from disk multiple times)
+ """
+ ckpt_dict = maybe_dict_from_checkpoint(ckpt_path=ckpt_path, ckpt_dict=ckpt_dict)
+ algo_name = ckpt_dict["algo_name"]
+ return algo_name, ckpt_dict
+
+
+def update_config(cfg):
+ """
+ Updates the config for backwards-compatibility if it uses outdated configurations.
+
+ See https://github.com/ARISE-Initiative/robomimic/releases/tag/v0.2.0 for more info.
+
+ Args:
+ cfg (dict): Raw dictionary of config values
+ """
+ # Check if image modality is defined -- this means we're using an outdated config
+ # Note: There may be a nested hierarchy, so we possibly check all the nested obs cfgs which can include
+ # e.g. a planner and actor for HBC
+
+ def find_obs_dicts_recursively(dic):
+ dics = []
+ if "modalities" in dic:
+ dics.append(dic)
+ else:
+ for child_dic in dic.values():
+ dics += find_obs_dicts_recursively(child_dic)
+ return dics
+
+ obs_cfgs = find_obs_dicts_recursively(cfg["observation"])
+ for obs_cfg in obs_cfgs:
+ modalities = obs_cfg["modalities"]
+
+ found_img = False
+ for modality_group in ("obs", "subgoal", "goal"):
+ if modality_group in modalities:
+ img_modality = modalities[modality_group].pop("image", None)
+ if img_modality is not None:
+ found_img = True
+ modalities[modality_group]["rgb"] = img_modality
+
+ if found_img:
+ # Also need to map encoder kwargs correctly
+ old_encoder_cfg = obs_cfg.pop("encoder")
+
+ # Create new encoder entry for RGB
+ rgb_encoder_cfg = {
+ "core_class": "VisualCore",
+ "core_kwargs": {
+ "backbone_kwargs": dict(),
+ "pool_kwargs": dict(),
+ },
+ "obs_randomizer_class": None,
+ "obs_randomizer_kwargs": dict(),
+ }
+
+ if "visual_feature_dimension" in old_encoder_cfg:
+ rgb_encoder_cfg["core_kwargs"]["feature_dimension"] = old_encoder_cfg["visual_feature_dimension"]
+
+ if "visual_core" in old_encoder_cfg:
+ rgb_encoder_cfg["core_kwargs"]["backbone_class"] = old_encoder_cfg["visual_core"]
+
+ for kwarg in ("pretrained", "input_coord_conv"):
+ if "visual_core_kwargs" in old_encoder_cfg and kwarg in old_encoder_cfg["visual_core_kwargs"]:
+ rgb_encoder_cfg["core_kwargs"]["backbone_kwargs"][kwarg] = old_encoder_cfg["visual_core_kwargs"][kwarg]
+
+ # Optionally add pooling info too
+ if old_encoder_cfg.get("use_spatial_softmax", True):
+ rgb_encoder_cfg["core_kwargs"]["pool_class"] = "SpatialSoftmax"
+
+ for kwarg in ("num_kp", "learnable_temperature", "temperature", "noise_std"):
+ if "spatial_softmax_kwargs" in old_encoder_cfg and kwarg in old_encoder_cfg["spatial_softmax_kwargs"]:
+ rgb_encoder_cfg["core_kwargs"]["pool_kwargs"][kwarg] = old_encoder_cfg["spatial_softmax_kwargs"][kwarg]
+
+ # Update obs randomizer as well
+ for kwarg in ("obs_randomizer_class", "obs_randomizer_kwargs"):
+ if kwarg in old_encoder_cfg:
+ rgb_encoder_cfg[kwarg] = old_encoder_cfg[kwarg]
+
+ # Store rgb config
+ obs_cfg["encoder"] = {"rgb": rgb_encoder_cfg}
+
+ # Also add defaults for low dim
+ obs_cfg["encoder"]["low_dim"] = {
+ "core_class": None,
+ "core_kwargs": {
+ "backbone_kwargs": dict(),
+ "pool_kwargs": dict(),
+ },
+ "obs_randomizer_class": None,
+ "obs_randomizer_kwargs": dict(),
+ }
+
+
+def config_from_checkpoint(algo_name=None, ckpt_path=None, ckpt_dict=None, verbose=False):
+ """
+ Helper function to restore config from a checkpoint file or loaded model dictionary.
+
+ Args:
+ algo_name (str): Algorithm name.
+
+ ckpt_path (str): Path to checkpoint file. Only needed if not providing @ckpt_dict.
+
+ ckpt_dict(dict): Loaded model checkpoint dictionary. Only needed if not providing @ckpt_path.
+
+ verbose (bool): if True, include print statements
+
+ Returns:
+ config (dict): Raw loaded configuration, without properties replaced.
+
+ ckpt_dict (dict): loaded checkpoint dictionary (convenient to avoid
+ re-loading checkpoint from disk multiple times)
+ """
+ ckpt_dict = maybe_dict_from_checkpoint(ckpt_path=ckpt_path, ckpt_dict=ckpt_dict)
+ if algo_name is None:
+ algo_name, _ = algo_name_from_checkpoint(ckpt_dict=ckpt_dict)
+
+ # restore config from loaded model dictionary
+ config_dict = json.loads(ckpt_dict['config'])
+ update_config(cfg=config_dict)
+
+ if verbose:
+ print("============= Loaded Config =============")
+ print(json.dumps(config_dict, indent=4))
+
+ config = config_factory(algo_name, dic=config_dict)
+
+ # lock config to prevent further modifications and ensure missing keys raise errors
+ config.lock()
+
+ return config, ckpt_dict
+
+
+def policy_from_checkpoint(device=None, ckpt_path=None, ckpt_dict=None, verbose=False):
+ """
+ This function restores a trained policy from a checkpoint file or
+ loaded model dictionary.
+
+ Args:
+ device (torch.device): if provided, put model on this device
+
+ ckpt_path (str): Path to checkpoint file. Only needed if not providing @ckpt_dict.
+
+ ckpt_dict(dict): Loaded model checkpoint dictionary. Only needed if not providing @ckpt_path.
+
+ verbose (bool): if True, include print statements
+
+ Returns:
+ model (RolloutPolicy): instance of Algo that has the saved weights from
+ the checkpoint file, and also acts as a policy that can easily
+ interact with an environment in a training loop
+
+ ckpt_dict (dict): loaded checkpoint dictionary (convenient to avoid
+ re-loading checkpoint from disk multiple times)
+ """
+ ckpt_dict = maybe_dict_from_checkpoint(ckpt_path=ckpt_path, ckpt_dict=ckpt_dict)
+
+ # algo name and config from model dict
+ algo_name, _ = algo_name_from_checkpoint(ckpt_dict=ckpt_dict)
+ config, _ = config_from_checkpoint(algo_name=algo_name, ckpt_dict=ckpt_dict, verbose=verbose)
+
+ # read config to set up metadata for observation modalities (e.g. detecting rgb observations)
+ ObsUtils.initialize_obs_utils_with_config(config)
+
+ # shape meta from model dict to get info needed to create model
+ shape_meta = ckpt_dict["shape_metadata"]
+
+ # maybe restore observation normalization stats
+ obs_normalization_stats = ckpt_dict.get("obs_normalization_stats", None)
+ if obs_normalization_stats is not None:
+ assert config.train.hdf5_normalize_obs
+ for m in obs_normalization_stats:
+ for k in obs_normalization_stats[m]:
+ obs_normalization_stats[m][k] = np.array(obs_normalization_stats[m][k])
+
+ # maybe restore action normalization stats
+ action_normalization_stats = ckpt_dict.get("action_normalization_stats", None)
+ if action_normalization_stats is not None:
+ for m in action_normalization_stats:
+ for k in action_normalization_stats[m]:
+ action_normalization_stats[m][k] = np.array(action_normalization_stats[m][k])
+
+ if device is None:
+ # get torch device
+ device = TorchUtils.get_torch_device(try_to_use_cuda=config.train.cuda)
+
+ # create model and load weights
+ model = algo_factory(
+ algo_name,
+ config,
+ obs_key_shapes=shape_meta["all_shapes"],
+ ac_dim=shape_meta["ac_dim"],
+ device=device,
+ )
+ model.deserialize(ckpt_dict["model"])
+ model.set_eval()
+ model = RolloutPolicy(
+ model,
+ obs_normalization_stats=obs_normalization_stats,
+ action_normalization_stats=action_normalization_stats
+ )
+ if verbose:
+ print("============= Loaded Policy =============")
+ print(model)
+ return model, ckpt_dict
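+
+# Illustrative usage sketch (the checkpoint path and `obs_dict` are hypothetical):
+#
+#   policy, ckpt_dict = policy_from_checkpoint(ckpt_path="model_epoch_100.pth")
+#   policy.start_episode()
+#   action = policy(ob=obs_dict)   # obs_dict is an observation dictionary from the environment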
+
+
+def env_from_checkpoint(ckpt_path=None, ckpt_dict=None, env_name=None, render=False, render_offscreen=False, verbose=False):
+ """
+ Creates an environment using the metadata saved in a checkpoint.
+
+ Args:
+ ckpt_path (str): Path to checkpoint file. Only needed if not providing @ckpt_dict.
+
+ ckpt_dict(dict): Loaded model checkpoint dictionary. Only needed if not providing @ckpt_path.
+
+ env_name (str): if provided, override environment name saved in checkpoint
+
+ render (bool): if True, environment supports on-screen rendering
+
+ render_offscreen (bool): if True, environment supports off-screen rendering. This
+ is forced to be True if saved model uses image observations.
+
+ Returns:
+ env (EnvBase instance): environment created using checkpoint
+
+ ckpt_dict (dict): loaded checkpoint dictionary (convenient to avoid
+ re-loading checkpoint from disk multiple times)
+ """
+ ckpt_dict = maybe_dict_from_checkpoint(ckpt_path=ckpt_path, ckpt_dict=ckpt_dict)
+
+ # metadata from model dict to get info needed to create environment
+ env_meta = ckpt_dict["env_metadata"]
+ shape_meta = ckpt_dict["shape_metadata"]
+
+ # create env from saved metadata
+ env = EnvUtils.create_env_from_metadata(
+ env_meta=env_meta,
+ env_name=env_name,
+ render=render,
+ render_offscreen=render_offscreen,
+ use_image_obs=shape_meta.get("use_images", False),
+ use_depth_obs=shape_meta.get("use_depths", False),
+ )
+ config, _ = config_from_checkpoint(algo_name=ckpt_dict["algo_name"], ckpt_dict=ckpt_dict, verbose=False)
+ env = EnvUtils.wrap_env_from_config(env, config=config) # apply environment wrapper, if applicable
+ if verbose:
+ print("============= Loaded Environment =============")
+ print(env)
+ return env, ckpt_dict
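+
+# Illustrative rollout sketch combining policy_from_checkpoint and env_from_checkpoint
+# (the checkpoint path and horizon are hypothetical):
+#
+#   policy, ckpt_dict = policy_from_checkpoint(ckpt_path="model_epoch_100.pth")
+#   env, _ = env_from_checkpoint(ckpt_dict=ckpt_dict, render_offscreen=True)
+#   obs = env.reset()
+#   policy.start_episode()
+#   for _ in range(400):
+#       obs, reward, done, _ = env.step(policy(ob=obs))
+#       if done:
+#           break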
+
+
+class DownloadProgressBar(tqdm):
+ def update_to(self, b=1, bsize=1, tsize=None):
+ if tsize is not None:
+ self.total = tsize
+ self.update(b * bsize - self.n)
+
+
+def url_is_alive(url):
+ """
+ Checks that a given URL is reachable.
+ From https://gist.github.com/dehowell/884204.
+
+ Args:
+ url (str): url string
+
+ Returns:
+ is_alive (bool): True if url is reachable, False otherwise
+ """
+ request = urllib.request.Request(url)
+ request.get_method = lambda: 'HEAD'
+
+ try:
+ urllib.request.urlopen(request)
+ return True
+ except urllib.request.HTTPError:
+ return False
+
+
+def download_url(url, download_dir, check_overwrite=True):
+ """
+ First checks that @url is reachable, then downloads the file
+ at that url into the directory specified by @download_dir.
+ Prints a progress bar during the download using tqdm.
+
+ Modified from https://github.com/tqdm/tqdm#hooks-and-callbacks, and
+ https://stackoverflow.com/a/53877507.
+
+ Args:
+ url (str): url string
+ download_dir (str): path to directory where file should be downloaded
+ check_overwrite (bool): if True, will sanity check the download fpath to make sure a file of that name
+ doesn't already exist there
+ """
+
+ # check if url is reachable. We need the sleep to make sure server doesn't reject subsequent requests
+ assert url_is_alive(url), "@download_url got unreachable url: {}".format(url)
+ time.sleep(0.5)
+
+ # infer filename from url link
+ fname = url.split("/")[-1]
+ file_to_write = os.path.join(download_dir, fname)
+
+ # If we're checking overwrite and the path already exists,
+ # we ask the user to verify that they want to overwrite the file
+ if check_overwrite and os.path.exists(file_to_write):
+ user_response = input(f"Warning: file {file_to_write} already exists. Overwrite? y/n\n")
+ assert user_response.lower() in {"yes", "y"}, f"Did not receive confirmation. Aborting download."
+
+ with DownloadProgressBar(unit='B', unit_scale=True,
+ miniters=1, desc=fname) as t:
+ urllib.request.urlretrieve(url, filename=file_to_write, reporthook=t.update_to)
+
+
+def find_and_replace_path_prefix(org_path, replace_prefixes, new_prefix, assert_replace=False):
+ """
+ Try to find and replace one of several prefixes (@replace_prefixes) in string @org_path
+ with another prefix (@new_prefix). If @assert_replace is True, the function asserts that
+ replacement did occur.
+ """
+ check_ind = -1
+ for i, x in enumerate(replace_prefixes):
+ if org_path.startswith(x):
+ check_ind = i
+ if assert_replace:
+ assert check_ind != -1
+ if check_ind == -1:
+ return org_path
+ replace_prefix = replace_prefixes[check_ind]
+ return org_path.replace(replace_prefix, new_prefix, 1)
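+
+# Worked example:
+#
+#   find_and_replace_path_prefix("/old/root/demos.hdf5", ["/old/root", "/other"], "/new/root")
+#   # -> "/new/root/demos.hdf5"; if no prefix matches and assert_replace is False,
+#   # the original path is returned unchanged.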
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/hyperparam_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/hyperparam_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..460cfbc9e6b4f43d36fde6cdf440332307da7af2
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/hyperparam_utils.py
@@ -0,0 +1,368 @@
+"""
+A collection of utility functions and classes for generating config jsons for hyperparameter sweeps.
+"""
+import argparse
+import os
+import json
+import re
+import itertools
+
+from collections import OrderedDict
+from copy import deepcopy
+
+
+class ConfigGenerator(object):
+ """
+ Useful class to keep track of hyperparameters to sweep, and to generate
+ the json configs for each experiment run.
+ """
+ def __init__(self, base_config_file, wandb_proj_name="debug", script_file=None, generated_config_dir=None):
+ """
+ Args:
+            base_config_file (str): path to a base json config to use as a starting point
+                for the parameter sweep.
+
+            wandb_proj_name (str): name of the wandb project to log runs under
+
+            script_file (str): script filename to write as output
+
+            generated_config_dir (str): if provided, directory where generated configs are
+                written (defaults to the directory containing @base_config_file)
+        """
+ assert isinstance(base_config_file, str)
+ self.base_config_file = base_config_file
+ assert generated_config_dir is None or isinstance(generated_config_dir, str)
+ if generated_config_dir is not None:
+ generated_config_dir = os.path.expanduser(generated_config_dir)
+ self.generated_config_dir = generated_config_dir
+ assert script_file is None or isinstance(script_file, str)
+ if script_file is None:
+ self.script_file = os.path.join('~', 'tmp/tmpp.sh')
+ else:
+ self.script_file = script_file
+ self.script_file = os.path.expanduser(self.script_file)
+ self.parameters = OrderedDict()
+
+ assert isinstance(wandb_proj_name, str)
+ self.wandb_proj_name = wandb_proj_name
+
+    def add_param(self, key, name, group, values, value_names=None, hidename=False):
+ """
+ Add parameter to the hyperparameter sweep.
+
+ Args:
+ key (str): location of parameter in the config, using hierarchical key format
+ (ex. train/data = config.train.data)
+
+ name (str): name, as it will appear in the experiment name
+
+ group (int): group id - parameters with the same ID have their values swept
+ together
+
+ values (list): list of values to sweep over for this parameter
+
+ value_names ([str]): if provided, strings to use in experiment name for
+ each value, instead of the parameter value. This is helpful for parameters
+                that may have long or large values (for example, dataset path).
+
+            hidename (bool): if True, exclude this parameter from the generated experiment
+                name even if @name is non-empty
+        """
+ if value_names is not None:
+ assert len(values) == len(value_names)
+ self.parameters[key] = argparse.Namespace(
+ key=key,
+ name=name,
+ group=group,
+ values=values,
+ value_names=value_names,
+ hidename=hidename,
+ )
+
+ def generate(self):
+ """
+ Generates json configs for the hyperparameter sweep using attributes
+ @self.parameters, @self.base_config_file, and @self.script_file,
+ all of which should have first been set externally by calling
+ @add_param, @set_base_config_file, and @set_script_file.
+ """
+ assert len(self.parameters) > 0, "must add parameters using add_param first!"
+ generated_json_paths = self._generate_jsons()
+ self._script_from_jsons(generated_json_paths)
+
+ def _name_for_experiment(self, base_name, parameter_values, parameter_value_names):
+ """
+ This function generates the name for an experiment, given one specific
+ parameter setting.
+
+ Args:
+ base_name (str): base experiment name
+ parameter_values (OrderedDict): dictionary that maps parameter name to
+ the parameter value for this experiment run
+ parameter_value_names (dict): dictionary that maps parameter name to
+ the name to use for its value in the experiment name
+
+ Returns:
+ name (str): generated experiment name
+ """
+ name = base_name
+ for k in parameter_values:
+ # append parameter name and value to end of base name
+            if len(self.parameters[k].name) == 0 or self.parameters[k].hidename:
+                # empty name (or hidename=True) indicates that naming should be skipped
+                continue
+ if parameter_value_names[k] is not None:
+ # take name from passed dictionary
+ val_str = parameter_value_names[k]
+ else:
+ val_str = parameter_values[k]
+ if isinstance(parameter_values[k], list) or isinstance(parameter_values[k], tuple):
+ # convert list to string to avoid weird spaces and naming problems
+ val_str = "_".join([str(x) for x in parameter_values[k]])
+ val_str = str(val_str)
+ name += '_{}'.format(self.parameters[k].name)
+ if len(val_str) > 0:
+ name += '_{}'.format(val_str)
+ return name
+
+ def _get_parameter_ranges(self):
+ """
+ Extract parameter ranges from base json file. Also takes all possible
+ combinations of the parameter ranges to generate an expanded set of values.
+
+ Returns:
+ parameter_ranges (dict): dictionary that maps the parameter to a list
+ of all values it should take for each generated config. The length
+ of the list will be the total number of configs that will be
+ generated from this scan.
+
+ parameter_names (dict): dictionary that maps the parameter to a list
+                of all name strings that should contribute to each individual
+ experiment's name. The length of the list will be the total
+ number of configs that will be generated from this scan.
+ """
+
+ # mapping from group id to list of indices to grab from each parameter's list
+ # of values in the parameter group
+ parameter_group_indices = OrderedDict()
+ for k in self.parameters:
+ group_id = self.parameters[k].group
+ assert isinstance(self.parameters[k].values, list)
+ num_param_values = len(self.parameters[k].values)
+ if group_id not in parameter_group_indices:
+ parameter_group_indices[group_id] = list(range(num_param_values))
+ else:
+ assert len(parameter_group_indices[group_id]) == num_param_values, \
+ "error: inconsistent number of parameter values in group with id {}".format(group_id)
+
+ keys = list(parameter_group_indices.keys())
+ inds = list(parameter_group_indices.values())
+ new_parameter_group_indices = OrderedDict(
+ { k : [] for k in keys }
+ )
+ # get all combinations of the different parameter group indices
+ # and then use these indices to determine the new parameter ranges
+ # per member of each parameter group.
+ #
+ # e.g. with two parameter groups, one with two values, and another with three values
+ # we have [0, 1] x [0, 1, 2] = [0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]
+ # so the corresponding parameter group indices are [0, 0, 0, 1, 1, 1] and
+ # [0, 1, 2, 0, 1, 2], and all parameters in each parameter group are indexed
+ # together using these indices, to get each parameter range.
+ for comb in itertools.product(*inds):
+ for i in range(len(comb)):
+ new_parameter_group_indices[keys[i]].append(comb[i])
+ parameter_group_indices = new_parameter_group_indices
+
+ # use the indices to gather the parameter values to sweep per parameter
+ parameter_ranges = OrderedDict()
+ parameter_names = OrderedDict()
+ for k in self.parameters:
+ parameter_values = self.parameters[k].values
+ group_id = self.parameters[k].group
+ inds = parameter_group_indices[group_id]
+ parameter_ranges[k] = [parameter_values[ind] for ind in inds]
+
+ # add in parameter names if supplied
+ parameter_names[k] = None
+ if self.parameters[k].value_names is not None:
+ par_names = self.parameters[k].value_names
+ assert isinstance(par_names, list)
+ assert len(par_names) == len(parameter_values)
+ parameter_names[k] = [par_names[ind] for ind in inds]
+
+ # ensure that the number of parameter settings is the same per parameter
+ first_key = list(parameter_ranges.keys())[0]
+ num_settings = len(parameter_ranges[first_key])
+ for k in parameter_ranges:
+ assert len(parameter_ranges[k]) == num_settings, "inconsistent number of values"
+
+ return parameter_ranges, parameter_names
+
+ def _generate_jsons(self):
+ """
+ Generates json configs for the hyperparameter sweep, using @self.parameters and
+ @self.base_config_file.
+
+ Returns:
+ json_paths (list): list of paths to created json files, one per experiment
+ """
+
+ # base directory for saving jsons
+ if self.generated_config_dir:
+ base_dir = self.generated_config_dir
+ if not os.path.exists(base_dir):
+ os.makedirs(base_dir)
+ else:
+ base_dir = os.path.abspath(os.path.dirname(self.base_config_file))
+
+ # read base json
+ base_config = load_json(self.base_config_file, verbose=False)
+
+ # base exp name from this base config
+ base_exp_name = base_config['experiment']['name']
+
+ # use base json to determine the parameter ranges
+ parameter_ranges, parameter_names = self._get_parameter_ranges()
+
+ # iterate through each parameter setting to create each json
+ first_key = list(parameter_ranges.keys())[0]
+ num_settings = len(parameter_ranges[first_key])
+
+ # keep track of path to generated jsons
+ json_paths = []
+
+ for i in range(num_settings):
+ # the specific parameter setting for this experiment
+ setting = { k : parameter_ranges[k][i] for k in parameter_ranges }
+ maybe_parameter_names = OrderedDict()
+ for k in parameter_names:
+ maybe_parameter_names[k] = None
+ if parameter_names[k] is not None:
+ maybe_parameter_names[k] = parameter_names[k][i]
+
+ # experiment name from setting
+ exp_name = self._name_for_experiment(
+ base_name=base_exp_name,
+ parameter_values=setting,
+ parameter_value_names=maybe_parameter_names,
+ )
+
+ # copy old json, but override name, and parameter values
+ json_dict = deepcopy(base_config)
+ json_dict['experiment']['name'] = exp_name
+ for k in parameter_ranges:
+ set_value_for_key(json_dict, k, v=parameter_ranges[k][i])
+
+ # populate list of identifying meta for logger;
+ # see meta_config method in base_config.py for more info
+ json_dict["experiment"]["logging"]["wandb_proj_name"] = self.wandb_proj_name
+ if "meta" not in json_dict:
+ json_dict["meta"] = dict()
+ json_dict["meta"].update(
+ hp_base_config_file=self.base_config_file,
+ hp_keys=list(),
+ hp_values=list(),
+ )
+ # logging: keep track of hyp param names and values as meta info
+ for k in parameter_ranges.keys():
+ key_name = self.parameters[k].name
+ if key_name is not None and len(key_name) > 0:
+ if maybe_parameter_names[k] is not None:
+ value_name = maybe_parameter_names[k]
+ else:
+ value_name = setting[k]
+
+ json_dict["meta"]["hp_keys"].append(key_name)
+ json_dict["meta"]["hp_values"].append(value_name)
+
+ # save file in same directory as old json
+ json_path = os.path.join(base_dir, "{}.json".format(exp_name))
+ save_json(json_dict, json_path)
+ json_paths.append(json_path)
+
+ print("Num exps:", len(json_paths))
+
+ return json_paths
+
+ def _script_from_jsons(self, json_paths):
+ """
+ Generates a bash script to run the experiments that correspond to
+ the input jsons.
+ """
+ with open(self.script_file, 'w') as f:
+ f.write("#!/bin/bash\n\n")
+ for path in json_paths:
+ # write python command to file
+ cmd = "python train.py --config {}\n".format(path)
+
+ print()
+ print(cmd)
+ f.write(cmd)
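+
+# Illustrative sweep sketch (file names, config keys, and values below are hypothetical):
+#
+#   generator = ConfigGenerator(base_config_file="base.json", script_file="sweep.sh")
+#   generator.add_param(key="train.batch_size", name="bs", group=0, values=[64, 128])
+#   generator.add_param(key="train.seq_length", name="seq", group=1, values=[10, 20])
+#   generator.generate()   # writes one json per setting (2 x 2 = 4 here) plus sweep.sh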
+
+
+def load_json(json_file, verbose=True):
+ """
+ Simple utility function to load a json file as a dict.
+
+ Args:
+ json_file (str): path to json file to load
+ verbose (bool): if True, pretty print the loaded json dictionary
+
+ Returns:
+ config (dict): json dictionary
+ """
+ with open(json_file, 'r') as f:
+ config = json.load(f)
+ if verbose:
+ print('loading external config: =================')
+ print(json.dumps(config, indent=4))
+ print('==========================================')
+ return config
+
+
+def save_json(config, json_file):
+ """
+ Simple utility function to save a dictionary to a json file on disk.
+
+ Args:
+ config (dict): dictionary to save
+ json_file (str): path to json file to write
+ """
+ with open(json_file, 'w') as f:
+ # preserve original key ordering
+ json.dump(config, f, sort_keys=False, indent=4)
+
+
+def get_value_for_key(dic, k):
+ """
+ Get value for nested dictionary with levels denoted by "/" or ".".
+ For example, if @k is "a/b", then this function returns
+ @dic["a"]["b"].
+
+ Args:
+ dic (dict): a nested dictionary
+ k (str): a single string meant to index several levels down into
+ the nested dictionary, where levels can be denoted by "/" or
+ by ".".
+ Returns:
+ val: the nested dictionary value for the provided key
+ """
+ val = dic
+    subkeys = re.split(r'/|\.', k)
+ for s in subkeys[:-1]:
+ val = val[s]
+ return val[subkeys[-1]]
+
+
+def set_value_for_key(dic, k, v):
+ """
+ Set value for hierarchical dictionary with levels denoted by "/" or ".".
+
+ Args:
+ dic (dict): a nested dictionary
+ k (str): a single string meant to index several levels down into
+ the nested dictionary, where levels can be denoted by "/" or
+ by ".".
+ v: the value to set at the provided key
+ """
+ val = dic
+    subkeys = re.split(r'/|\.', k)
+ for s in subkeys[:-1]:
+ val = val[s]
+ val[subkeys[-1]] = v
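+
+# Worked example: "/" and "." delimiters address the same nested entry:
+#
+#   cfg = {"train": {"batch_size": 64}}
+#   get_value_for_key(cfg, "train/batch_size")    # -> 64
+#   set_value_for_key(cfg, "train.batch_size", 128)
+#   get_value_for_key(cfg, "train.batch_size")    # -> 128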
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/log_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/log_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b431d5d4988dfdd60135eb3b81319fc825ba223a
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/log_utils.py
@@ -0,0 +1,230 @@
+"""
+This file contains utility classes and functions for logging to stdout, stderr,
+and to tensorboard.
+"""
+import os
+import sys
+import numpy as np
+from datetime import datetime
+from contextlib import contextmanager
+import textwrap
+import time
+from tqdm import tqdm
+from termcolor import colored
+
+import robomimic
+
+# global list of warning messages can be populated with @log_warning and flushed with @flush_warnings
+WARNINGS_BUFFER = []
+
+
+class PrintLogger(object):
+ """
+ This class redirects print statements to both console and a file.
+ """
+ def __init__(self, log_file):
+ self.terminal = sys.stdout
+ print('STDOUT will be forked to %s' % log_file)
+ self.log_file = open(log_file, "a")
+
+ def fileno(self):
+ return self.terminal.fileno()
+
+ def write(self, message):
+ self.terminal.write(message)
+ self.log_file.write(message)
+ self.log_file.flush()
+
+ def flush(self):
+ # this flush method is needed for python 3 compatibility.
+ # this handles the flush command by doing nothing.
+ # you might want to specify some extra behavior here.
+ pass
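+
+# Illustrative usage sketch (the log path is hypothetical): fork all prints to a file.
+#
+#   logger = PrintLogger("train_log.txt")
+#   sys.stdout = logger
+#   sys.stderr = logger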
+
+
+class DataLogger(object):
+ """
+ Logging class to log metrics to tensorboard and/or retrieve running statistics about logged data.
+ """
+ def __init__(self, log_dir, config, log_tb=True, log_wandb=False):
+ """
+ Args:
+ log_dir (str): base path to store logs
+ log_tb (bool): whether to use tensorboard logging
+ """
+ self._tb_logger = None
+ self._wandb_logger = None
+ self._data = dict() # store all the scalar data logged so far
+
+ if log_tb:
+ from tensorboardX import SummaryWriter
+ self._tb_logger = SummaryWriter(os.path.join(log_dir, 'tb'))
+
+ if log_wandb:
+ import wandb
+ import robomimic.macros as Macros
+
+ # set up wandb api key if specified in macros
+ if Macros.WANDB_API_KEY is not None:
+ os.environ["WANDB_API_KEY"] = Macros.WANDB_API_KEY
+
+ assert Macros.WANDB_ENTITY is not None, "WANDB_ENTITY macro is set to None." \
+ "\nSet this macro in {base_path}/macros_private.py" \
+ "\nIf this file does not exist, first run python {base_path}/scripts/setup_macros.py".format(base_path=robomimic.__path__[0])
+
+ # attempt to set up wandb 10 times. If unsuccessful after these trials, don't use wandb
+ num_attempts = 10
+ for attempt in range(num_attempts):
+ try:
+ # set up wandb
+ self._wandb_logger = wandb
+
+ self._wandb_logger.init(
+ entity=Macros.WANDB_ENTITY,
+ project=config.experiment.logging.wandb_proj_name,
+ name=config.experiment.name,
+ dir=log_dir,
+ mode=("offline" if attempt == num_attempts - 1 else "online"),
+ )
+
+ # set up info for identifying experiment
+ wandb_config = {k: v for (k, v) in config.meta.items() if k not in ["hp_keys", "hp_values"]}
+ for (k, v) in zip(config.meta["hp_keys"], config.meta["hp_values"]):
+ wandb_config[k] = v
+ if "algo" not in wandb_config:
+ wandb_config["algo"] = config.algo_name
+ self._wandb_logger.config.update(wandb_config)
+
+ break
+ except Exception as e:
+ log_warning("wandb initialization error (attempt #{}): {}".format(attempt + 1, e))
+ self._wandb_logger = None
+ time.sleep(30)
+
+ def record(self, k, v, epoch, data_type='scalar', log_stats=False):
+ """
+ Record data with logger.
+ Args:
+ k (str): key string
+ v (float or image): value to store
+ epoch: current epoch number
+ data_type (str): the type of data. either 'scalar' or 'image'
+ log_stats (bool): whether to store the mean/max/min/std for all data logged so far with key k
+ """
+
+ assert data_type in ['scalar', 'image']
+
+ if data_type == 'scalar':
+ # maybe update internal cache if logging stats for this key
+ if log_stats or k in self._data: # any key that we're logging or previously logged
+ if k not in self._data:
+ self._data[k] = []
+ self._data[k].append(v)
+
+ # maybe log to tensorboard
+ if self._tb_logger is not None:
+ if data_type == 'scalar':
+ self._tb_logger.add_scalar(k, v, epoch)
+ if log_stats:
+ stats = self.get_stats(k)
+ for (stat_k, stat_v) in stats.items():
+ stat_k_name = '{}-{}'.format(k, stat_k)
+ self._tb_logger.add_scalar(stat_k_name, stat_v, epoch)
+ elif data_type == 'image':
+ self._tb_logger.add_images(k, img_tensor=v, global_step=epoch, dataformats="NHWC")
+
+ if self._wandb_logger is not None:
+ try:
+ if data_type == 'scalar':
+ self._wandb_logger.log({k: v}, step=epoch)
+ if log_stats:
+ stats = self.get_stats(k)
+ for (stat_k, stat_v) in stats.items():
+ self._wandb_logger.log({"{}/{}".format(k, stat_k): stat_v}, step=epoch)
+ elif data_type == 'image':
+ raise NotImplementedError
+ except Exception as e:
+ log_warning("wandb logging: {}".format(e))
+
+ def get_stats(self, k):
+ """
+ Computes running statistics for a particular key.
+ Args:
+ k (str): key string
+ Returns:
+ stats (dict): dictionary of statistics
+ """
+ stats = dict()
+ stats['mean'] = np.mean(self._data[k])
+ stats['std'] = np.std(self._data[k])
+ stats['min'] = np.min(self._data[k])
+ stats['max'] = np.max(self._data[k])
+ return stats
+
+ def close(self):
+ """
+ Run before terminating to make sure all logs are flushed
+ """
+ if self._tb_logger is not None:
+ self._tb_logger.close()
+
+ if self._wandb_logger is not None:
+ self._wandb_logger.finish()
+
+
+class custom_tqdm(tqdm):
+ """
+ Small extension to tqdm to make a few changes from default behavior.
+ By default tqdm writes to stderr. Instead, we change it to write
+ to stdout.
+ """
+ def __init__(self, *args, **kwargs):
+ assert "file" not in kwargs
+ super(custom_tqdm, self).__init__(*args, file=sys.stdout, **kwargs)
+
+
+@contextmanager
+def silence_stdout():
+ """
+ This contextmanager will redirect stdout so that nothing is printed
+ to the terminal. Taken from the link below:
+
+ https://stackoverflow.com/questions/6735917/redirecting-stdout-to-nothing-in-python
+ """
+ old_target = sys.stdout
+ try:
+ with open(os.devnull, "w") as new_target:
+ sys.stdout = new_target
+ yield new_target
+ finally:
+ sys.stdout = old_target
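+
+# Illustrative usage sketch (`noisy_setup` is a hypothetical function whose prints are suppressed):
+#
+#   with silence_stdout():
+#       noisy_setup()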
+
+
+def log_warning(message, color="yellow", print_now=True):
+ """
+ This function logs a warning message by recording it in a global warning buffer.
+ The global registry will be maintained until @flush_warnings is called, at
+ which point the warnings will get printed to the terminal.
+
+ Args:
+ message (str): warning message to display
+ color (str): color of message - defaults to "yellow"
+ print_now (bool): if True (default), will print to terminal immediately, in
+ addition to adding it to the global warning buffer
+ """
+ global WARNINGS_BUFFER
+ buffer_message = colored("ROBOMIMIC WARNING(\n{}\n)".format(textwrap.indent(message, " ")), color)
+ WARNINGS_BUFFER.append(buffer_message)
+ if print_now:
+ print(buffer_message)
+
+
+def flush_warnings():
+ """
+ This function flushes all warnings from the global warning buffer to the terminal and
+ clears the global registry.
+ """
+ global WARNINGS_BUFFER
+ for msg in WARNINGS_BUFFER:
+ print(msg)
+ WARNINGS_BUFFER = []
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/loss_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/loss_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3f5bf223ed7dfbd510b4b8a8edf2e98b0567613
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/loss_utils.py
@@ -0,0 +1,208 @@
+"""
+This file contains a collection of useful loss functions for use with torch tensors.
+"""
+
+import math
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+
+def cosine_loss(preds, labels):
+ """
+ Cosine loss between two tensors.
+
+ Args:
+ preds (torch.Tensor): torch tensor
+ labels (torch.Tensor): torch tensor
+
+ Returns:
+ loss (torch.Tensor): cosine loss
+ """
+ sim = torch.nn.CosineSimilarity(dim=len(preds.shape) - 1)(preds, labels)
+ return -torch.mean(sim - 1.0)
+
+
+def KLD_0_1_loss(mu, logvar):
+ """
+ KL divergence loss. Computes D_KL( N(mu, sigma) || N(0, 1) ). Note that
+ this function averages across the batch dimension, but sums across dimension.
+
+ Args:
+ mu (torch.Tensor): mean tensor of shape (B, D)
+ logvar (torch.Tensor): logvar tensor of shape (B, D)
+
+ Returns:
+ loss (torch.Tensor): KL divergence loss between the input gaussian distribution
+ and N(0, 1)
+ """
+ return -0.5 * (1. + logvar - mu.pow(2) - logvar.exp()).sum(dim=1).mean()
+
+
+def KLD_gaussian_loss(mu_1, logvar_1, mu_2, logvar_2):
+ """
+ KL divergence loss between two Gaussian distributions. This function
+ computes the average loss across the batch.
+
+ Args:
+ mu_1 (torch.Tensor): first means tensor of shape (B, D)
+ logvar_1 (torch.Tensor): first logvars tensor of shape (B, D)
+ mu_2 (torch.Tensor): second means tensor of shape (B, D)
+ logvar_2 (torch.Tensor): second logvars tensor of shape (B, D)
+
+ Returns:
+ loss (torch.Tensor): KL divergence loss between the two gaussian distributions
+ """
+ return -0.5 * (1. + \
+ logvar_1 - logvar_2 \
+ - ((mu_2 - mu_1).pow(2) / logvar_2.exp()) \
+ - (logvar_1.exp() / logvar_2.exp()) \
+ ).sum(dim=1).mean()
+
+
+def log_normal(x, m, v):
+ """
+ Log probability of tensor x under diagonal multivariate normal with
+ mean m and variance v. The last dimension of the tensors is treated
+ as the dimension of the Gaussian distribution - all other dimensions
+ are treated as independent Gaussians. Adapted from CS 236 at Stanford.
+
+ Args:
+ x (torch.Tensor): tensor with shape (B, ..., D)
+ m (torch.Tensor): means tensor with shape (B, ..., D) or (1, ..., D)
+ v (torch.Tensor): variances tensor with shape (B, ..., D) or (1, ..., D)
+
+ Returns:
+ log_prob (torch.Tensor): log probabilities of shape (B, ...)
+ """
+ element_wise = -0.5 * (torch.log(v) + (x - m).pow(2) / v + np.log(2 * np.pi))
+ log_prob = element_wise.sum(-1)
+ return log_prob
+
+
+def log_normal_mixture(x, m, v, w=None, log_w=None):
+ """
+ Log probability of tensor x under a uniform mixture of Gaussians.
+ Adapted from CS 236 at Stanford.
+
+ Args:
+ x (torch.Tensor): tensor with shape (B, D)
+ m (torch.Tensor): means tensor with shape (B, M, D) or (1, M, D), where
+ M is number of mixture components
+ v (torch.Tensor): variances tensor with shape (B, M, D) or (1, M, D) where
+ M is number of mixture components
+ w (torch.Tensor): weights tensor - if provided, should be
+ shape (B, M) or (1, M)
+ log_w (torch.Tensor): log-weights tensor - if provided, should be
+ shape (B, M) or (1, M)
+
+ Returns:
+ log_prob (torch.Tensor): log probabilities of shape (B,)
+ """
+
+ # (B , D) -> (B , 1, D)
+ x = x.unsqueeze(1)
+ # (B, 1, D) -> (B, M, D) -> (B, M)
+ log_prob = log_normal(x, m, v)
+ if w is not None or log_w is not None:
+ # this weights the log probabilities by the mixture weights so we have log(w_i * N(x | m_i, v_i))
+ if w is not None:
+ assert log_w is None
+ log_w = torch.log(w)
+ log_prob += log_w
+ # then compute log sum_i exp [log(w_i * N(x | m_i, v_i))]
+ # (B, M) -> (B,)
+        log_prob = log_sum_exp(log_prob, dim=1)
+    else:
+        # (B, M) -> (B,)
+        log_prob = log_mean_exp(log_prob, dim=1) # mean accounts for uniform weights
+ return log_prob
+
+
+def log_mean_exp(x, dim):
+ """
+ Compute the log(mean(exp(x), dim)) in a numerically stable manner.
+ Adapted from CS 236 at Stanford.
+
+ Args:
+ x (torch.Tensor): a tensor
+ dim (int): dimension along which mean is computed
+
+ Returns:
+ y (torch.Tensor): log(mean(exp(x), dim))
+ """
+ return log_sum_exp(x, dim) - np.log(x.size(dim))
+
+
+def log_sum_exp(x, dim=0):
+ """
+ Compute the log(sum(exp(x), dim)) in a numerically stable manner.
+ Adapted from CS 236 at Stanford.
+
+ Args:
+ x (torch.Tensor): a tensor
+ dim (int): dimension along which sum is computed
+
+ Returns:
+ y (torch.Tensor): log(sum(exp(x), dim))
+ """
+ max_x = torch.max(x, dim)[0]
+ new_x = x - max_x.unsqueeze(dim).expand_as(x)
+ return max_x + (new_x.exp().sum(dim)).log()
+
+
+def project_values_onto_atoms(values, probabilities, atoms):
+ """
+ Project the categorical distribution given by @probabilities on the
+ grid of values given by @values onto a grid of values given by @atoms.
+ This is useful when computing a bellman backup where the backed up
+ values from the original grid will not be in the original support,
+ requiring L2 projection.
+
+ Each value in @values has a corresponding probability in @probabilities -
+ this probability mass is shifted to the closest neighboring grid points in
+ @atoms in proportion. For example, if the value in question is 0.2, and the
+ neighboring atoms are 0 and 1, then 0.8 of the probability weight goes to
+ atom 0 and 0.2 of the probability weight will go to 1.
+
+ Adapted from https://github.com/deepmind/acme/blob/master/acme/tf/losses/distributional.py#L42
+
+ Args:
+ values: value grid to project, of shape (batch_size, n_atoms)
+ probabilities: probabilities for categorical distribution on @values, shape (batch_size, n_atoms)
+ atoms: value grid to project onto, of shape (n_atoms,) or (1, n_atoms)
+
+ Returns:
+ new probability vectors that correspond to the L2 projection of the categorical distribution
+ onto @atoms
+ """
+
+ # make sure @atoms is shape (n_atoms,)
+ if len(atoms.shape) > 1:
+ atoms = atoms.squeeze(0)
+
+ # helper tensors from @atoms
+    vmin, vmax = atoms[0], atoms[-1]
+ d_pos = torch.cat([atoms, vmin[None]], dim=0)[1:]
+ d_neg = torch.cat([vmax[None], atoms], dim=0)[:-1]
+
+ # ensure that @values grid is within the support of @atoms
+ clipped_values = values.clamp(min=vmin, max=vmax)[:, None, :] # (batch_size, 1, n_atoms)
+ clipped_atoms = atoms[None, :, None] # (1, n_atoms, 1)
+
+ # distance between atom values in support
+ d_pos = (d_pos - atoms)[None, :, None] # atoms[i + 1] - atoms[i], shape (1, n_atoms, 1)
+ d_neg = (atoms - d_neg)[None, :, None] # atoms[i] - atoms[i - 1], shape (1, n_atoms, 1)
+
+ # distances between all pairs of grid values
+ deltas = clipped_values - clipped_atoms # (batch_size, n_atoms, n_atoms)
+
+ # computes eqn (7) in distributional RL paper by doing the following - for each
+ # output atom in @atoms, consider values that are close enough, and weight their
+ # probability mass contribution by the normalized distance in [0, 1] given
+ # by (1. - (z_j - z_i) / (delta_z)).
+ d_sign = (deltas >= 0.).float()
+ delta_hat = (d_sign * deltas / d_pos) - ((1. - d_sign) * deltas / d_neg)
+ delta_hat = (1. - delta_hat).clamp(min=0., max=1.)
+ probabilities = probabilities[:, None, :]
+ return (delta_hat * probabilities).sum(dim=2)
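+
+# Worked example matching the docstring above: values [0.2, 0.7], each with probability 0.5,
+# projected onto atoms [0, 1] send 0.5*0.8 + 0.5*0.3 = 0.55 of the mass to atom 0 and
+# 0.5*0.2 + 0.5*0.7 = 0.45 to atom 1:
+#
+#   project_values_onto_atoms(
+#       values=torch.tensor([[0.2, 0.7]]),
+#       probabilities=torch.tensor([[0.5, 0.5]]),
+#       atoms=torch.tensor([0.0, 1.0]),
+#   )   # -> tensor([[0.5500, 0.4500]])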
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/obs_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/obs_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e1840de0419ca1523f066aad095481ea3516d95
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/obs_utils.py
@@ -0,0 +1,1025 @@
+"""
+A collection of utilities for working with observation dictionaries and
+different kinds of modalities such as images.
+"""
+import numpy as np
+from copy import deepcopy
+from collections import OrderedDict
+
+import torch
+import torch.nn.functional as F
+
+import robomimic.utils.tensor_utils as TU
+
+# MACRO FOR VALID IMAGE CHANNEL SIZES
+VALID_IMAGE_CHANNEL_DIMS = {1, 3} # depth, rgb
+
+# DO NOT MODIFY THIS!
+# This keeps track of observation types (modalities) - and is populated on call to @initialize_obs_utils_with_obs_specs.
+# This will be a dictionary that maps observation modality (e.g. low_dim, rgb) to a list of observation
+# keys under that observation modality.
+OBS_MODALITIES_TO_KEYS = None
+
+# DO NOT MODIFY THIS!
+# This keeps track of observation types (modalities) - and is populated on call to @initialize_obs_utils_with_obs_specs.
+# This will be a dictionary that maps observation keys to their corresponding observation modality
+# (e.g. low_dim, rgb)
+OBS_KEYS_TO_MODALITIES = None
+
+# DO NOT MODIFY THIS
+# This holds the default encoder kwargs that will be used if none are passed at runtime for any given network
+DEFAULT_ENCODER_KWARGS = None
+
+# DO NOT MODIFY THIS
+# This holds the registered observation modality classes
+OBS_MODALITY_CLASSES = {}
+
+# DO NOT MODIFY THIS
+# This global dict stores mapping from observation encoder / randomizer network name to class.
+# We keep track of these registries to enable automated class inference at runtime, allowing
+# users to simply extend our base encoder / randomizer class and refer to that class in string form
+# in their config, without having to manually register their class internally.
+# This also future-proofs us for any additional encoder / randomizer classes we would
+# like to add ourselves.
+OBS_ENCODER_CORES = {"None": None} # Include default None
+OBS_RANDOMIZERS = {"None": None} # Include default None
+
+
+def register_obs_key(target_class):
+    assert target_class.name not in OBS_MODALITY_CLASSES, f"Already registered modality {target_class}!"
+ OBS_MODALITY_CLASSES[target_class.name] = target_class
+
+
+def register_encoder_core(target_class):
+    assert target_class.__name__ not in OBS_ENCODER_CORES, f"Already registered obs encoder core {target_class}!"
+ OBS_ENCODER_CORES[target_class.__name__] = target_class
+
+
+def register_randomizer(target_class):
+    assert target_class.__name__ not in OBS_RANDOMIZERS, f"Already registered obs randomizer {target_class}!"
+ OBS_RANDOMIZERS[target_class.__name__] = target_class
+
+
+class ObservationKeyToModalityDict(dict):
+ """
+ Custom dictionary class with the sole additional purpose of automatically registering new "keys" at runtime
+ without breaking. This is mainly for backwards compatibility, where certain keys such as "latent", "actions", etc.
+ are used automatically by certain models (e.g.: VAEs) but were never specified by the user externally in their
+ config. Thus, this dictionary will automatically handle those keys by implicitly associating them with the low_dim
+ modality.
+ """
+ def __getitem__(self, item):
+ # If a key doesn't already exist, warn the user and add default mapping
+ if item not in self.keys():
+ print(f"ObservationKeyToModalityDict: {item} not found,"
+ f" adding {item} to mapping with assumed low_dim modality!")
+ self.__setitem__(item, "low_dim")
+ return super(ObservationKeyToModalityDict, self).__getitem__(item)
+
+
+def obs_encoder_kwargs_from_config(obs_encoder_config):
+ """
+ Generate a set of args used to create visual backbones for networks
+ from the observation encoder config.
+
+ Args:
+ obs_encoder_config (Config): Config object containing relevant encoder information. Should be equivalent to
+ config.observation.encoder
+
+ Returns:
+ dict: Processed encoder kwargs
+ """
+ # Loop over each obs modality
+ # Unlock encoder config
+ obs_encoder_config.unlock()
+ for obs_modality, encoder_kwargs in obs_encoder_config.items():
+ # First run some sanity checks and store the classes
+ for cls_name, cores in zip(("core", "obs_randomizer"), (OBS_ENCODER_CORES, OBS_RANDOMIZERS)):
+ # Make sure the requested encoder for each obs_modality exists
+ cfg_cls = encoder_kwargs[f"{cls_name}_class"]
+ if cfg_cls is not None:
+                assert cfg_cls in cores, f"No {cls_name} class with name {cfg_cls} found, must register this class " \
+                    f"before creating model!"
+ # encoder_kwargs[f"{cls_name}_class"] = cores[cfg_cls]
+
+ # Process core and randomizer kwargs
+ encoder_kwargs.core_kwargs = dict() if encoder_kwargs.core_kwargs is None else \
+ deepcopy(encoder_kwargs.core_kwargs)
+ encoder_kwargs.obs_randomizer_kwargs = dict() if encoder_kwargs.obs_randomizer_kwargs is None else \
+ deepcopy(encoder_kwargs.obs_randomizer_kwargs)
+
+ # Re-lock keys
+ obs_encoder_config.lock()
+
+ return dict(obs_encoder_config)
+
+
+def initialize_obs_modality_mapping_from_dict(modality_mapping):
+ """
+ This function is an alternative to @initialize_obs_utils_with_obs_specs, that allows manually setting of modalities.
+    This function is an alternative to @initialize_obs_utils_with_obs_specs that allows manually setting the modalities.
+    NOTE: Only one of these should be called at runtime -- not both! (Note that all training scripts that use a config
+    automatically handle obs modality mapping, so using this function is usually unnecessary.)
+ Args:
+ modality_mapping (dict): Maps modality string names (e.g.: rgb, low_dim, etc.) to a list of observation
+ keys that should belong to that modality
+ """
+ global OBS_KEYS_TO_MODALITIES, OBS_MODALITIES_TO_KEYS
+
+ OBS_KEYS_TO_MODALITIES = ObservationKeyToModalityDict()
+ OBS_MODALITIES_TO_KEYS = dict()
+
+ for mod, keys in modality_mapping.items():
+ OBS_MODALITIES_TO_KEYS[mod] = deepcopy(keys)
+ OBS_KEYS_TO_MODALITIES.update({k: mod for k in keys})
+
+
+def initialize_obs_utils_with_obs_specs(obs_modality_specs):
+ """
+ This function should be called before using any observation key-specific
+ functions in this file, in order to make sure that all utility
+ functions are aware of the observation modalities (e.g. which ones
+ are low-dimensional, which ones are rgb, etc.).
+
+ It constructs two dictionaries: (1) that map observation modality (e.g. low_dim, rgb) to
+ a list of observation keys under that modality, and (2) that maps the inverse, specific
+ observation keys to their corresponding observation modality.
+
+ Input should be a nested dictionary (or list of such dicts) with the following structure:
+
+ obs_variant (str):
+ obs_modality (str): observation keys (list)
+ ...
+ ...
+
+ Example:
+ {
+ "obs": {
+ "low_dim": ["robot0_eef_pos", "robot0_eef_quat"],
+ "rgb": ["agentview_image", "robot0_eye_in_hand"],
+ }
+ "goal": {
+ "low_dim": ["robot0_eef_pos"],
+ "rgb": ["agentview_image"]
+ }
+ }
+
+ In the example, raw observations consist of low-dim and rgb modalities, with
+ the robot end effector pose under low-dim, and the agentview and wrist camera
+ images under rgb, while goal observations also consist of low-dim and rgb modalities,
+ with a subset of the raw observation keys per modality.
+
+ Args:
+ obs_modality_specs (dict or list): A nested dictionary (see docstring above for an example)
+ or a list of nested dictionaries. Accepting a list as input makes it convenient for
+ situations where multiple modules may each have their own modality spec.
+ """
+ global OBS_KEYS_TO_MODALITIES, OBS_MODALITIES_TO_KEYS
+
+ OBS_KEYS_TO_MODALITIES = ObservationKeyToModalityDict()
+
+ # accept one or more spec dictionaries - if it's just one, account for this
+ if isinstance(obs_modality_specs, dict):
+ obs_modality_spec_list = [obs_modality_specs]
+ else:
+ obs_modality_spec_list = obs_modality_specs
+
+ # iterates over observation specs
+ obs_modality_mapping = {}
+ for obs_modality_spec in obs_modality_spec_list:
+ # iterates over observation variants (e.g. observations, goals, subgoals)
+ for obs_modalities in obs_modality_spec.values():
+ for obs_modality, obs_keys in obs_modalities.items():
+ # add all keys for each obs modality to the corresponding list in obs_modality_mapping
+ if obs_modality not in obs_modality_mapping:
+ obs_modality_mapping[obs_modality] = []
+ obs_modality_mapping[obs_modality] += obs_keys
+ # loop over each modality, and add to global dict if it doesn't exist yet
+ for obs_key in obs_keys:
+ if obs_key not in OBS_KEYS_TO_MODALITIES:
+ OBS_KEYS_TO_MODALITIES[obs_key] = obs_modality
+ # otherwise, run sanity check to make sure we don't have conflicting, duplicate entries
+ else:
+ assert OBS_KEYS_TO_MODALITIES[obs_key] == obs_modality, \
+ f"Cannot register obs key {obs_key} with modality {obs_modality}; " \
+ f"already exists with corresponding modality {OBS_KEYS_TO_MODALITIES[obs_key]}"
+
+ # remove duplicate entries and store in global mapping
+ OBS_MODALITIES_TO_KEYS = { obs_modality : list(set(obs_modality_mapping[obs_modality])) for obs_modality in obs_modality_mapping }
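+
+# Illustrative call sketch mirroring the docstring example (observation keys are hypothetical):
+#
+#   initialize_obs_utils_with_obs_specs({
+#       "obs": {
+#           "low_dim": ["robot0_eef_pos", "robot0_eef_quat"],
+#           "rgb": ["agentview_image"],
+#       },
+#   })
+#   # afterwards, OBS_KEYS_TO_MODALITIES["agentview_image"] == "rgb"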
+
+
+def initialize_default_obs_encoder(obs_encoder_config):
+ """
+ Initializes the default observation encoder kwarg information to be used by all networks if no values are manually
+ specified at runtime.
+
+ Args:
+ obs_encoder_config (Config): Observation encoder config to use.
+ Should be equivalent to config.observation.encoder
+ """
+ global DEFAULT_ENCODER_KWARGS
+ DEFAULT_ENCODER_KWARGS = obs_encoder_kwargs_from_config(obs_encoder_config)
+
+
+def initialize_obs_utils_with_config(config):
+ """
+ Utility function to parse config and call @initialize_obs_utils_with_obs_specs and
+    @initialize_default_obs_encoder with the correct arguments.
+
+ Args:
+ config (BaseConfig instance): config object
+ """
+ if config.algo_name == "hbc":
+ obs_modality_specs = [
+ config.observation.planner.modalities,
+ config.observation.actor.modalities,
+ ]
+ obs_encoder_config = config.observation.actor.encoder
+ elif config.algo_name == "iris":
+ obs_modality_specs = [
+ config.observation.value_planner.planner.modalities,
+ config.observation.value_planner.value.modalities,
+ config.observation.actor.modalities,
+ ]
+ obs_encoder_config = config.observation.actor.encoder
+ else:
+ obs_modality_specs = [config.observation.modalities]
+ obs_encoder_config = config.observation.encoder
+ initialize_obs_utils_with_obs_specs(obs_modality_specs=obs_modality_specs)
+ initialize_default_obs_encoder(obs_encoder_config=obs_encoder_config)
+
+
+def key_is_obs_modality(key, obs_modality):
+ """
+ Check if observation key corresponds to modality @obs_modality.
+
+ Args:
+ key (str): obs key name to check
+ obs_modality (str): observation modality - e.g.: "low_dim", "rgb"
+ """
+    assert OBS_KEYS_TO_MODALITIES is not None, "error: must call ObsUtils.initialize_obs_utils_with_obs_specs first"
+ return OBS_KEYS_TO_MODALITIES[key] == obs_modality
+
+
+def center_crop(im, t_h, t_w):
+ """
+ Takes a center crop of an image.
+
+ Args:
+ im (np.array or torch.Tensor): image of shape (..., height, width, channel)
+ t_h (int): height of crop
+ t_w (int): width of crop
+
+ Returns:
+ im (np.array or torch.Tensor): center cropped image
+ """
+ assert(im.shape[-3] >= t_h and im.shape[-2] >= t_w)
+ assert(im.shape[-1] in [1, 3])
+ crop_h = int((im.shape[-3] - t_h) / 2)
+ crop_w = int((im.shape[-2] - t_w) / 2)
+ return im[..., crop_h:crop_h + t_h, crop_w:crop_w + t_w, :]
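+
+# Illustrative usage sketch for center_crop (not part of upstream robomimic):
+#
+#     im = np.zeros((100, 100, 3), dtype=np.uint8)   # (H, W, C)
+#     center_crop(im, t_h=84, t_w=84).shape          # -> (84, 84, 3)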
+
+
+def batch_image_hwc_to_chw(im):
+ """
+ Channel swap for images - useful for preparing images for
+ torch training.
+
+ Args:
+ im (np.array or torch.Tensor): image of shape (batch, height, width, channel)
+ or (height, width, channel)
+
+ Returns:
+ im (np.array or torch.Tensor): image of shape (batch, channel, height, width)
+ or (channel, height, width)
+ """
+ start_dims = np.arange(len(im.shape) - 3).tolist()
+ s = start_dims[-1] if len(start_dims) > 0 else -1
+ if isinstance(im, np.ndarray):
+ return im.transpose(start_dims + [s + 3, s + 1, s + 2])
+ else:
+ return im.permute(start_dims + [s + 3, s + 1, s + 2])
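+
+# Illustrative shape check for batch_image_hwc_to_chw (not part of upstream robomimic):
+#
+#     im = np.zeros((4, 84, 84, 3))          # (B, H, W, C)
+#     batch_image_hwc_to_chw(im).shape       # -> (4, 3, 84, 84)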
+
+
+def batch_image_chw_to_hwc(im):
+ """
+ Inverse of channel swap in @batch_image_hwc_to_chw.
+
+ Args:
+ im (np.array or torch.Tensor): image of shape (batch, channel, height, width)
+ or (channel, height, width)
+
+ Returns:
+ im (np.array or torch.Tensor): image of shape (batch, height, width, channel)
+ or (height, width, channel)
+ """
+ start_dims = np.arange(len(im.shape) - 3).tolist()
+ s = start_dims[-1] if len(start_dims) > 0 else -1
+ if isinstance(im, np.ndarray):
+ return im.transpose(start_dims + [s + 2, s + 3, s + 1])
+ else:
+ return im.permute(start_dims + [s + 2, s + 3, s + 1])
+
+
+def process_obs(obs, obs_modality=None, obs_key=None):
+ """
+ Process observation @obs corresponding to @obs_modality modality (or implicitly inferred from @obs_key)
+ to prepare for network input.
+
+ Note that either obs_modality OR obs_key must be specified!
+
+ If both are specified, obs_key will override obs_modality
+
+ Args:
+ obs (np.array or torch.Tensor): Observation to process. Leading batch dimension is optional
+ obs_modality (str): Observation modality (e.g.: depth, image, low_dim, etc.)
+ obs_key (str): Name of observation from which to infer @obs_modality
+
+ Returns:
+ processed_obs (np.array or torch.Tensor): processed observation
+ """
+ assert obs_modality is not None or obs_key is not None, "Either obs_modality or obs_key must be specified!"
+ if obs_key is not None:
+ obs_modality = OBS_KEYS_TO_MODALITIES[obs_key]
+ return OBS_MODALITY_CLASSES[obs_modality].process_obs(obs)
+
+
+def process_obs_dict(obs_dict):
+ """
+ Process observations in observation dictionary to prepare for network input.
+
+ Args:
+ obs_dict (dict): dictionary mapping observation keys to np.array or
+ torch.Tensor. Leading batch dimensions are optional.
+
+ Returns:
+ new_dict (dict): dictionary where observation keys have been processed by their corresponding processors
+ """
+ return { k : process_obs(obs=obs, obs_key=k) for k, obs in obs_dict.items() } # shallow copy
+
+
+def process_frame(frame, channel_dim, scale):
+ """
+ Given frame fetched from dataset, process for network input. Converts array
+ to float (from uint8), normalizes pixels from range [0, @scale] to [0, 1], and channel swaps
+ from (H, W, C) to (C, H, W).
+
+ Args:
+ frame (np.array or torch.Tensor): frame array
+ channel_dim (int): Number of channels to sanity check for
+ scale (float or None): Value to normalize inputs by
+
+ Returns:
+ processed_frame (np.array or torch.Tensor): processed frame
+ """
+ # Channel size should either be 3 (RGB) or 1 (depth)
+ frame = TU.to_float(frame)
+ if scale is not None:
+ frame = frame / scale
+ frame = frame.clip(0.0, 1.0)
+ if frame.shape[-1] == 3 or frame.shape[-1] == 1:
+ frame = batch_image_hwc_to_chw(frame)
+
+ return frame
+
+
+def unprocess_obs(obs, obs_modality=None, obs_key=None):
+ """
+ Prepare observation @obs corresponding to @obs_modality modality (or implicitly inferred from @obs_key)
+    for deployment.
+
+ Note that either obs_modality OR obs_key must be specified!
+
+ If both are specified, obs_key will override obs_modality
+
+ Args:
+ obs (np.array or torch.Tensor): Observation to unprocess. Leading batch dimension is optional
+ obs_modality (str): Observation modality (e.g.: depth, image, low_dim, etc.)
+ obs_key (str): Name of observation from which to infer @obs_modality
+
+ Returns:
+ unprocessed_obs (np.array or torch.Tensor): unprocessed observation
+ """
+ assert obs_modality is not None or obs_key is not None, "Either obs_modality or obs_key must be specified!"
+ if obs_key is not None:
+ obs_modality = OBS_KEYS_TO_MODALITIES[obs_key]
+ return OBS_MODALITY_CLASSES[obs_modality].unprocess_obs(obs)
+
+
+def unprocess_obs_dict(obs_dict):
+ """
+ Prepare processed observation dictionary for saving to dataset. Inverse of
+    @process_obs_dict.
+
+ Args:
+ obs_dict (dict): dictionary mapping observation keys to np.array or
+ torch.Tensor. Leading batch dimensions are optional.
+
+ Returns:
+ new_dict (dict): dictionary where observation keys have been unprocessed by
+ their respective unprocessor methods
+ """
+ return { k : unprocess_obs(obs=obs, obs_key=k) for k, obs in obs_dict.items() } # shallow copy
+
+
+def unprocess_frame(frame, channel_dim, scale):
+ """
+ Given frame prepared for network input, prepare for saving to dataset.
+ Inverse of @process_frame.
+
+ Args:
+ frame (np.array or torch.Tensor): frame array
+ channel_dim (int): What channel dimension should be (used for sanity check)
+ scale (float or None): Scaling factor to apply during denormalization
+
+ Returns:
+ unprocessed_frame (np.array or torch.Tensor): frame passed through
+ inverse operation of @process_frame
+ """
+ assert frame.shape[-3] == channel_dim # check for channel dimension
+ frame = batch_image_chw_to_hwc(frame)
+ if scale is not None:
+ frame = scale * frame
+ return frame
+
+
+def get_processed_shape(obs_modality, input_shape):
+ """
+ Given observation modality @obs_modality and expected inputs of shape @input_shape (excluding batch dimension), return the
+ expected processed observation shape resulting from process_{obs_modality}.
+
+ Args:
+ obs_modality (str): Observation modality to use (e.g.: low_dim, rgb, depth, etc...)
+ input_shape (list of int): Expected input dimensions, excluding the batch dimension
+
+ Returns:
+ list of int: expected processed input shape
+ """
+ return list(process_obs(obs=np.zeros(input_shape), obs_modality=obs_modality).shape)
+
+
+def normalize_dict(dict, normalization_stats):
+ """
+ Normalize dict using the provided "offset" and "scale" entries
+ for each observation key. The dictionary will be
+ modified in-place.
+
+ Args:
+ dict (dict): dictionary mapping key to np.array or
+ torch.Tensor. Leading batch dimensions are optional.
+
+ normalization_stats (dict): this should map keys to dicts
+            with an "offset" and "scale" of shape (1, ...) where ... is the default
+ shape for the dict value.
+
+ Returns:
+ dict (dict): obs dict with normalized arrays
+ """
+
+ # ensure we have statistics for each modality key in the dict
+ assert set(dict.keys()).issubset(normalization_stats)
+
+ for m in dict:
+ # get rid of extra dimension - we will pad for broadcasting later
+ offset = normalization_stats[m]["offset"][0]
+ scale = normalization_stats[m]["scale"][0]
+
+ # shape consistency checks
+ m_num_dims = len(offset.shape)
+ shape_len_diff = len(dict[m].shape) - m_num_dims
+ assert shape_len_diff >= 0, "shape length mismatch in @normalize_dict"
+ assert dict[m].shape[-m_num_dims:] == offset.shape, "shape mismatch in @normalize_dict"
+
+ # dict can have one or more leading batch dims - prepare for broadcasting
+ reshape_padding = tuple([1] * shape_len_diff)
+ offset = offset.reshape(reshape_padding + tuple(offset.shape))
+ scale = scale.reshape(reshape_padding + tuple(scale.shape))
+
+ dict[m] = (dict[m] - offset) / scale
+
+ return dict
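+
+# Illustrative usage sketch for normalize_dict (not part of upstream robomimic).
+# Note the input dict is modified in place and also returned:
+#
+#     obs = {"robot0_eef_pos": np.random.randn(10, 3)}                                # batch of 10
+#     stats = {"robot0_eef_pos": {"offset": np.zeros((1, 3)), "scale": np.ones((1, 3))}}
+#     normalize_dict(obs, stats)["robot0_eef_pos"].shape                              # -> (10, 3)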
+
+
+def unnormalize_dict(dict, normalization_stats):
+ """
+ Unnormalize dict using the provided "offset" and "scale" entries
+ for each observation key. The dictionary will be
+ modified in-place.
+
+ Args:
+ dict (dict): dictionary mapping key to np.array or
+ torch.Tensor. Leading batch dimensions are optional.
+
+ normalization_stats (dict): this should map keys to dicts
+            with an "offset" and "scale" of shape (1, ...) where ... is the default
+ shape for the dict value.
+
+ Returns:
+        dict (dict): obs dict with unnormalized arrays
+ """
+
+ # ensure we have statistics for each modality key in the dict
+ assert set(dict.keys()).issubset(normalization_stats)
+
+ for m in dict:
+ # get rid of extra dimension - we will pad for broadcasting later
+ offset = normalization_stats[m]["offset"][0]
+ scale = normalization_stats[m]["scale"][0]
+
+ # shape consistency checks
+ m_num_dims = len(offset.shape)
+ shape_len_diff = len(dict[m].shape) - m_num_dims
+ assert shape_len_diff >= 0, "shape length mismatch in @unnormalize_dict"
+ assert dict[m].shape[-m_num_dims:] == offset.shape, "shape mismatch in @unnormalize_dict"
+
+ # dict can have one or more leading batch dims - prepare for broadcasting
+ reshape_padding = tuple([1] * shape_len_diff)
+ offset = offset.reshape(reshape_padding + tuple(offset.shape))
+ scale = scale.reshape(reshape_padding + tuple(scale.shape))
+
+ dict[m] = (dict[m] * scale) + offset
+
+ return dict
+
+
+def has_modality(modality, obs_keys):
+ """
+ Returns True if @modality is present in the list of observation keys @obs_keys.
+
+ Args:
+ modality (str): modality to check for, e.g.: rgb, depth, etc.
+ obs_keys (list): list of observation keys
+ """
+ for k in obs_keys:
+ if key_is_obs_modality(k, obs_modality=modality):
+ return True
+ return False
+
+
+def repeat_and_stack_observation(obs_dict, n):
+ """
+ Given an observation dictionary and a desired repeat value @n,
+ this function will return a new observation dictionary where
+ each modality is repeated @n times and the copies are
+ stacked in the first dimension.
+
+ For example, if a batch of 3 observations comes in, and n is 2,
+ the output will look like [ob1; ob1; ob2; ob2; ob3; ob3] in
+ each modality.
+
+ Args:
+ obs_dict (dict): dictionary mapping observation key to np.array or
+ torch.Tensor. Leading batch dimensions are optional.
+
+ n (int): number to repeat by
+
+ Returns:
+ repeat_obs_dict (dict): repeated obs dict
+ """
+ return TU.repeat_by_expand_at(obs_dict, repeats=n, dim=0)
+
+
+def crop_image_from_indices(images, crop_indices, crop_height, crop_width):
+ """
+ Crops images at the locations specified by @crop_indices. Crops will be
+ taken across all channels.
+
+ Args:
+ images (torch.Tensor): batch of images of shape [..., C, H, W]
+
+ crop_indices (torch.Tensor): batch of indices of shape [..., N, 2] where
+ N is the number of crops to take per image and each entry corresponds
+ to the pixel height and width of where to take the crop. Note that
+ the indices can also be of shape [..., 2] if only 1 crop should
+ be taken per image. Leading dimensions must be consistent with
+ @images argument. Each index specifies the top left of the crop.
+ Values must be in range [0, H - CH - 1] x [0, W - CW - 1] where
+ H and W are the height and width of @images and CH and CW are
+ @crop_height and @crop_width.
+
+ crop_height (int): height of crop to take
+
+ crop_width (int): width of crop to take
+
+ Returns:
+        crops (torch.Tensor): cropped images of shape [..., C, @crop_height, @crop_width]
+ """
+
+ # make sure length of input shapes is consistent
+ assert crop_indices.shape[-1] == 2
+ ndim_im_shape = len(images.shape)
+ ndim_indices_shape = len(crop_indices.shape)
+ assert (ndim_im_shape == ndim_indices_shape + 1) or (ndim_im_shape == ndim_indices_shape + 2)
+
+ # maybe pad so that @crop_indices is shape [..., N, 2]
+ is_padded = False
+ if ndim_im_shape == ndim_indices_shape + 2:
+ crop_indices = crop_indices.unsqueeze(-2)
+ is_padded = True
+
+ # make sure leading dimensions between images and indices are consistent
+ assert images.shape[:-3] == crop_indices.shape[:-2]
+
+ device = images.device
+ image_c, image_h, image_w = images.shape[-3:]
+ num_crops = crop_indices.shape[-2]
+
+ # make sure @crop_indices are in valid range
+ assert (crop_indices[..., 0] >= 0).all().item()
+ assert (crop_indices[..., 0] < (image_h - crop_height)).all().item()
+ assert (crop_indices[..., 1] >= 0).all().item()
+ assert (crop_indices[..., 1] < (image_w - crop_width)).all().item()
+
+ # convert each crop index (ch, cw) into a list of pixel indices that correspond to the entire window.
+
+ # 2D index array with columns [0, 1, ..., CH - 1] and shape [CH, CW]
+ crop_ind_grid_h = torch.arange(crop_height).to(device)
+ crop_ind_grid_h = TU.unsqueeze_expand_at(crop_ind_grid_h, size=crop_width, dim=-1)
+ # 2D index array with rows [0, 1, ..., CW - 1] and shape [CH, CW]
+ crop_ind_grid_w = torch.arange(crop_width).to(device)
+ crop_ind_grid_w = TU.unsqueeze_expand_at(crop_ind_grid_w, size=crop_height, dim=0)
+ # combine into shape [CH, CW, 2]
+ crop_in_grid = torch.cat((crop_ind_grid_h.unsqueeze(-1), crop_ind_grid_w.unsqueeze(-1)), dim=-1)
+
+ # Add above grid with the offset index of each sampled crop to get 2d indices for each crop.
+ # After broadcasting, this will be shape [..., N, CH, CW, 2] and each crop has a [CH, CW, 2]
+ # shape array that tells us which pixels from the corresponding source image to grab.
+ grid_reshape = [1] * len(crop_indices.shape[:-1]) + [crop_height, crop_width, 2]
+ all_crop_inds = crop_indices.unsqueeze(-2).unsqueeze(-2) + crop_in_grid.reshape(grid_reshape)
+
+ # For using @torch.gather, convert to flat indices from 2D indices, and also
+ # repeat across the channel dimension. To get flat index of each pixel to grab for
+ # each sampled crop, we just use the mapping: ind = h_ind * @image_w + w_ind
+ all_crop_inds = all_crop_inds[..., 0] * image_w + all_crop_inds[..., 1] # shape [..., N, CH, CW]
+ all_crop_inds = TU.unsqueeze_expand_at(all_crop_inds, size=image_c, dim=-3) # shape [..., N, C, CH, CW]
+ all_crop_inds = TU.flatten(all_crop_inds, begin_axis=-2) # shape [..., N, C, CH * CW]
+
+ # Repeat and flatten the source images -> [..., N, C, H * W] and then use gather to index with crop pixel inds
+ images_to_crop = TU.unsqueeze_expand_at(images, size=num_crops, dim=-4)
+ images_to_crop = TU.flatten(images_to_crop, begin_axis=-2)
+ crops = torch.gather(images_to_crop, dim=-1, index=all_crop_inds)
+ # [..., N, C, CH * CW] -> [..., N, C, CH, CW]
+ reshape_axis = len(crops.shape) - 1
+ crops = TU.reshape_dimensions(crops, begin_axis=reshape_axis, end_axis=reshape_axis,
+ target_dims=(crop_height, crop_width))
+
+ if is_padded:
+ # undo padding -> [..., C, CH, CW]
+ crops = crops.squeeze(-4)
+ return crops
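+
+# Illustrative shape check for crop_image_from_indices (not part of upstream robomimic):
+#
+#     images = torch.zeros(2, 3, 100, 100)                     # [B, C, H, W]
+#     inds = torch.randint(0, 80, (2, 4, 2))                   # 4 crops per image
+#     crop_image_from_indices(images, inds, 20, 20).shape      # -> (2, 4, 3, 20, 20)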
+
+
+def sample_random_image_crops(images, crop_height, crop_width, num_crops, pos_enc=False):
+ """
+ For each image, randomly sample @num_crops crops of size (@crop_height, @crop_width), from
+ @images.
+
+ Args:
+ images (torch.Tensor): batch of images of shape [..., C, H, W]
+
+ crop_height (int): height of crop to take
+
+ crop_width (int): width of crop to take
+
+        num_crops (int): number of crops to sample
+
+        pos_enc (bool): if True, also add 2 channels to the outputs that give a spatial
+            encoding of the original source pixel locations. This means that the
+            output crops will contain information about where in the source image
+            they were sampled from.
+
+ Returns:
+ crops (torch.Tensor): crops of shape (..., @num_crops, C, @crop_height, @crop_width)
+ if @pos_enc is False, otherwise (..., @num_crops, C + 2, @crop_height, @crop_width)
+
+ crop_inds (torch.Tensor): sampled crop indices of shape (..., N, 2)
+ """
+ device = images.device
+
+ # maybe add 2 channels of spatial encoding to the source image
+ source_im = images
+ if pos_enc:
+ # spatial encoding [y, x] in [0, 1]
+ h, w = source_im.shape[-2:]
+ pos_y, pos_x = torch.meshgrid(torch.arange(h), torch.arange(w))
+ pos_y = pos_y.float().to(device) / float(h)
+ pos_x = pos_x.float().to(device) / float(w)
+ position_enc = torch.stack((pos_y, pos_x)) # shape [C, H, W]
+
+ # unsqueeze and expand to match leading dimensions -> shape [..., C, H, W]
+ leading_shape = source_im.shape[:-3]
+ position_enc = position_enc[(None,) * len(leading_shape)]
+ position_enc = position_enc.expand(*leading_shape, -1, -1, -1)
+
+ # concat across channel dimension with input
+ source_im = torch.cat((source_im, position_enc), dim=-3)
+
+ # make sure sample boundaries ensure crops are fully within the images
+ image_c, image_h, image_w = source_im.shape[-3:]
+ max_sample_h = image_h - crop_height
+ max_sample_w = image_w - crop_width
+
+ # Sample crop locations for all tensor dimensions up to the last 3, which are [C, H, W].
+ # Each gets @num_crops samples - typically this will just be the batch dimension (B), so
+ # we will sample [B, N] indices, but this supports having more than one leading dimension,
+ # or possibly no leading dimension.
+ #
+ # Trick: sample in [0, 1) with rand, then re-scale to [0, M) and convert to long to get sampled ints
+ crop_inds_h = (max_sample_h * torch.rand(*source_im.shape[:-3], num_crops).to(device)).long()
+ crop_inds_w = (max_sample_w * torch.rand(*source_im.shape[:-3], num_crops).to(device)).long()
+ crop_inds = torch.cat((crop_inds_h.unsqueeze(-1), crop_inds_w.unsqueeze(-1)), dim=-1) # shape [..., N, 2]
+
+ crops = crop_image_from_indices(
+ images=source_im,
+ crop_indices=crop_inds,
+ crop_height=crop_height,
+ crop_width=crop_width,
+ )
+
+ return crops, crop_inds
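+
+# Illustrative shape check for sample_random_image_crops (not part of upstream robomimic):
+#
+#     images = torch.zeros(2, 3, 84, 84)                                    # [B, C, H, W]
+#     crops, inds = sample_random_image_crops(images, 76, 76, num_crops=1)
+#     crops.shape, inds.shape                                               # -> (2, 1, 3, 76, 76), (2, 1, 2)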
+
+
+class Modality:
+ """
+ Observation Modality class to encapsulate necessary functions needed to
+ process observations of this modality
+ """
+ # observation keys to associate with this modality
+ keys = set()
+
+ # Custom processing function that should prepare raw observations of this modality for training
+ _custom_obs_processor = None
+
+ # Custom unprocessing function that should prepare observations of this modality used during training for deployment
+ _custom_obs_unprocessor = None
+
+ # Name of this modality -- must be set by subclass!
+ name = None
+
+ def __init_subclass__(cls, **kwargs):
+ """
+ Hook method to automatically register all valid subclasses so we can keep track of valid modalities
+ """
+ assert cls.name is not None, f"Name of modality {cls.__name__} must be specified!"
+ register_obs_key(cls)
+
+ @classmethod
+ def set_keys(cls, keys):
+ """
+ Sets the observation keys associated with this modality.
+
+ Args:
+ keys (list or set): observation keys to associate with this modality
+ """
+ cls.keys = {k for k in keys}
+
+ @classmethod
+ def add_keys(cls, keys):
+ """
+ Adds the observation @keys associated with this modality to the current set of keys.
+
+ Args:
+ keys (list or set): observation keys to add to associate with this modality
+ """
+ for key in keys:
+ cls.keys.add(key)
+
+ @classmethod
+ def set_obs_processor(cls, processor=None):
+ """
+ Sets the processor for this observation modality. If @processor is set to None, then
+ the obs processor will use the default one (self.process_obs(...)). Otherwise, @processor
+ should be a function to process this corresponding observation modality.
+
+ Args:
+ processor (function or None): If not None, should be function that takes in either a
+                np.array or torch.Tensor and outputs the processed array / tensor. If None, will reset
+ to the default processor (self.process_obs(...))
+ """
+ cls._custom_obs_processor = processor
+
+ @classmethod
+ def set_obs_unprocessor(cls, unprocessor=None):
+ """
+ Sets the unprocessor for this observation modality. If @unprocessor is set to None, then
+ the obs unprocessor will use the default one (self.unprocess_obs(...)). Otherwise, @unprocessor
+        should be a function to unprocess this corresponding observation modality.
+
+ Args:
+ unprocessor (function or None): If not None, should be function that takes in either a
+                np.array or torch.Tensor and outputs the unprocessed array / tensor. If None, will reset
+ to the default unprocessor (self.unprocess_obs(...))
+ """
+ cls._custom_obs_unprocessor = unprocessor
+
+ @classmethod
+ def _default_obs_processor(cls, obs):
+ """
+ Default processing function for this obs modality.
+
+ Note that this function is overridden by self.custom_obs_processor (a function with identical inputs / outputs)
+ if it is not None.
+
+ Args:
+ obs (np.array or torch.Tensor): raw observation, which may include a leading batch dimension
+
+ Returns:
+ np.array or torch.Tensor: processed observation
+ """
+ raise NotImplementedError
+
+ @classmethod
+ def _default_obs_unprocessor(cls, obs):
+ """
+ Default unprocessing function for this obs modality.
+
+ Note that this function is overridden by self.custom_obs_unprocessor
+ (a function with identical inputs / outputs) if it is not None.
+
+ Args:
+ obs (np.array or torch.Tensor): processed observation, which may include a leading batch dimension
+
+ Returns:
+ np.array or torch.Tensor: unprocessed observation
+ """
+ raise NotImplementedError
+
+ @classmethod
+ def process_obs(cls, obs):
+ """
+ Prepares an observation @obs of this modality for network input.
+
+ Args:
+ obs (np.array or torch.Tensor): raw observation, which may include a leading batch dimension
+
+ Returns:
+ np.array or torch.Tensor: processed observation
+ """
+ processor = cls._custom_obs_processor if \
+ cls._custom_obs_processor is not None else cls._default_obs_processor
+ return processor(obs)
+
+ @classmethod
+ def unprocess_obs(cls, obs):
+ """
+ Prepares an observation @obs of this modality for deployment.
+
+ Args:
+ obs (np.array or torch.Tensor): processed observation, which may include a leading batch dimension
+
+ Returns:
+ np.array or torch.Tensor: unprocessed observation
+ """
+ unprocessor = cls._custom_obs_unprocessor if \
+ cls._custom_obs_unprocessor is not None else cls._default_obs_unprocessor
+ return unprocessor(obs)
+
+ @classmethod
+ def process_obs_from_dict(cls, obs_dict, inplace=True):
+ """
+ Receives a dictionary of keyword mapped observations @obs_dict, and processes the observations with keys
+ corresponding to this modality. A copy will be made of the received dictionary unless @inplace is True
+
+ Args:
+ obs_dict (dict): Dictionary mapping observation keys to observations
+ inplace (bool): If True, will modify @obs_dict in place, otherwise, will create a copy
+
+ Returns:
+ dict: observation dictionary with processed observations corresponding to this modality
+ """
+        if not inplace:
+            obs_dict = deepcopy(obs_dict)
+        # Loop over all keys and process the ones corresponding to this modality
+        for key, obs in obs_dict.items():
+ if key in cls.keys:
+ obs_dict[key] = cls.process_obs(obs)
+
+ return obs_dict
+
+
+class ImageModality(Modality):
+ """
+ Modality for RGB image observations
+ """
+ name = "rgb"
+
+ @classmethod
+ def _default_obs_processor(cls, obs):
+ """
+ Given image fetched from dataset, process for network input. Converts array
+ to float (from uint8), normalizes pixels from range [0, 255] to [0, 1], and channel swaps
+ from (H, W, C) to (C, H, W).
+
+ Args:
+ obs (np.array or torch.Tensor): image array
+
+ Returns:
+ processed_obs (np.array or torch.Tensor): processed image
+ """
+ return process_frame(frame=obs, channel_dim=3, scale=255.)
+
+ @classmethod
+ def _default_obs_unprocessor(cls, obs):
+ """
+ Given image prepared for network input, prepare for saving to dataset.
+ Inverse of @process_frame.
+
+ Args:
+ obs (np.array or torch.Tensor): image array
+
+ Returns:
+ unprocessed_obs (np.array or torch.Tensor): image passed through
+ inverse operation of @process_frame
+ """
+ return TU.to_uint8(unprocess_frame(frame=obs, channel_dim=3, scale=255.))
+
+
+class DepthModality(Modality):
+ """
+ Modality for depth observations
+ """
+ name = "depth"
+
+ @classmethod
+ def _default_obs_processor(cls, obs):
+ """
+ Given depth fetched from dataset, process for network input. Converts array
+        to float, clips values to the range [0, 1], and channel swaps
+ from (H, W, C) to (C, H, W).
+
+ Args:
+ obs (np.array or torch.Tensor): depth array
+
+ Returns:
+ processed_obs (np.array or torch.Tensor): processed depth
+ """
+ return process_frame(frame=obs, channel_dim=1, scale=1.)
+
+ @classmethod
+ def _default_obs_unprocessor(cls, obs):
+ """
+ Given depth prepared for network input, prepare for saving to dataset.
+        Inverse of @process_frame.
+
+ Args:
+ obs (np.array or torch.Tensor): depth array
+
+ Returns:
+ unprocessed_obs (np.array or torch.Tensor): depth passed through
+ inverse operation of @process_depth
+                inverse operation of @process_frame
+ return unprocess_frame(frame=obs, channel_dim=1, scale=1.)
+
+
+class ScanModality(Modality):
+ """
+ Modality for scan observations
+ """
+ name = "scan"
+
+ @classmethod
+ def _default_obs_processor(cls, obs):
+ # Channel swaps ([...,] L, C) --> ([...,] C, L)
+
+        # First, add an extra singleton dimension so the input is treated as a frame of shape (H=1, W=L, C)
+ shape = obs.shape
+ new_shape = [*shape[:-2], 1, *shape[-2:]]
+ obs = obs.reshape(new_shape)
+
+ # Convert shape
+ obs = batch_image_hwc_to_chw(obs)
+
+ # Remove extra dimension (it's the second from last dimension)
+ obs = obs.squeeze(-2)
+ return obs
+
+ @classmethod
+ def _default_obs_unprocessor(cls, obs):
+ # Channel swaps ([B,] C, L) --> ([B,] L, C)
+
+        # First, add an extra singleton dimension so the input is treated as a frame of
+        # shape (C, H=1, W=L)
+        shape = obs.shape
+        new_shape = [*shape[:-1], 1, shape[-1]]
+        obs = obs.reshape(new_shape)
+
+        # Convert shape: ([B,] C, 1, L) --> ([B,] 1, L, C)
+        obs = batch_image_chw_to_hwc(obs)
+
+        # Remove the extra singleton dimension (now the third from last dimension)
+        obs = obs.squeeze(-3)
+ return obs
+
+
+class LowDimModality(Modality):
+ """
+ Modality for low dimensional observations
+ """
+ name = "low_dim"
+
+ @classmethod
+ def _default_obs_processor(cls, obs):
+ return obs
+
+ @classmethod
+ def _default_obs_unprocessor(cls, obs):
+ return obs
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/python_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/python_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bc71bd1aaaf08bb406f3a72e83886d86c0d19a6
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/python_utils.py
@@ -0,0 +1,73 @@
+"""
+Set of general purpose utility functions for easier interfacing with Python API
+"""
+import inspect
+from copy import deepcopy
+import robomimic.macros as Macros
+
+
+def get_class_init_kwargs(cls):
+ """
+ Helper function to return a list of all valid keyword arguments (excluding "self") for the given @cls class.
+
+ Args:
+ cls (object): Class from which to grab __init__ kwargs
+
+ Returns:
+ list: All keyword arguments (excluding "self") specified by @cls __init__ constructor method
+ """
+ return list(inspect.signature(cls.__init__).parameters.keys())[1:]
+
+
+def extract_subset_dict(dic, keys, copy=False):
+ """
+ Helper function to extract a subset of dictionary key-values from a current dictionary. Optionally (deep)copies
+ the values extracted from the original @dic if @copy is True.
+
+ Args:
+ dic (dict): Dictionary containing multiple key-values
+ keys (Iterable): Specific keys to extract from @dic. If the key doesn't exist in @dic, then the key is skipped
+ copy (bool): If True, will deepcopy all values corresponding to the specified @keys
+
+ Returns:
+ dict: Extracted subset dictionary containing only the specified @keys and their corresponding values
+ """
+ subset = {k: dic[k] for k in keys if k in dic}
+ return deepcopy(subset) if copy else subset
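+
+# Illustrative usage sketch for extract_subset_dict (not part of upstream robomimic).
+# Keys missing from the source dict are silently skipped:
+#
+#     extract_subset_dict({"a": 1, "b": 2}, keys=["a", "c"])   # -> {"a": 1}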
+
+
+def extract_class_init_kwargs_from_dict(cls, dic, copy=False, verbose=False):
+ """
+ Helper function to return a dictionary of key-values that specifically correspond to @cls class's __init__
+ constructor method, from @dic which may or may not contain additional, irrelevant kwargs.
+
+ Note that @dic may possibly be missing certain kwargs as specified by cls.__init__. No error will be raised.
+
+ Args:
+        cls (object): Class from which to grab __init__ kwargs that will be used as filtering keys for @dic
+ dic (dict): Dictionary containing multiple key-values
+        copy (bool): If True, will deepcopy all values corresponding to the extracted keys
+ verbose (bool): If True (or if macro DEBUG is True), then will print out mismatched keys
+
+ Returns:
+ dict: Extracted subset dictionary possibly containing only the specified keys from cls.__init__ and their
+ corresponding values
+ """
+ # extract only relevant kwargs for this specific backbone
+ cls_keys = get_class_init_kwargs(cls)
+ subdic = extract_subset_dict(
+ dic=dic,
+ keys=cls_keys,
+ copy=copy,
+ )
+
+ # Run sanity check if verbose or debugging
+ if verbose or Macros.DEBUG:
+ keys_not_in_cls = [k for k in dic if k not in cls_keys]
+ keys_not_in_dic = [k for k in cls_keys if k not in list(dic.keys())]
+ if len(keys_not_in_cls) > 0:
+ print(f"Warning: For class {cls.__name__}, got unknown keys: {keys_not_in_cls} ")
+ if len(keys_not_in_dic) > 0:
+ print(f"Warning: For class {cls.__name__}, got missing keys: {keys_not_in_dic} ")
+
+ return subdic
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/script_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/script_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e42eaf627715b7d8ce30aaf43dfbfd992678beab
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/script_utils.py
@@ -0,0 +1,15 @@
+"""
+Collection of miscellaneous utility tools
+"""
+
+def deep_update(d, u):
+ """
+ Copied from https://stackoverflow.com/a/3233356
+ """
+ import collections
+ for k, v in u.items():
+ if isinstance(v, collections.abc.Mapping):
+ d[k] = deep_update(d.get(k, {}), v)
+ else:
+ d[k] = v
+ return d
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/tensor_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/tensor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfdbbc5299e6f623c40c1e9b2b40bb7a2ac5dcb4
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/tensor_utils.py
@@ -0,0 +1,995 @@
+"""
+A collection of utilities for working with nested tensor structures consisting
+of numpy arrays and torch tensors.
+"""
+import collections
+import numpy as np
+import torch
+
+
+def recursive_dict_list_tuple_apply(x, type_func_dict, error_on_missing_type=True):
+ """
+ Recursively apply functions to a nested dictionary or list or tuple, given a dictionary of
+ {data_type: function_to_apply}.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ type_func_dict (dict): a mapping from data types to the functions to be
+ applied for each data type.
+ error_on_missing_type (bool): if True, raise an error if a type outside the @type_func_dict is
+ encountered, else, just return the same value (identity function)
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ assert(list not in type_func_dict)
+ assert(tuple not in type_func_dict)
+ assert(dict not in type_func_dict)
+
+ if isinstance(x, (dict, collections.OrderedDict)):
+ new_x = collections.OrderedDict() if isinstance(x, collections.OrderedDict) else dict()
+ for k, v in x.items():
+ new_x[k] = recursive_dict_list_tuple_apply(v, type_func_dict, error_on_missing_type)
+ return new_x
+ elif isinstance(x, (list, tuple)):
+ ret = [recursive_dict_list_tuple_apply(v, type_func_dict, error_on_missing_type) for v in x]
+ if isinstance(x, tuple):
+ ret = tuple(ret)
+ return ret
+ else:
+ for t, f in type_func_dict.items():
+ if isinstance(x, t):
+ return f(x)
+ else:
+ if error_on_missing_type:
+ raise NotImplementedError(
+ 'Cannot handle data type %s' % str(type(x)))
+ return x
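+
+# Illustrative usage sketch for recursive_dict_list_tuple_apply (not part of upstream
+# robomimic). Each handler is applied per leaf; None values are passed through here:
+#
+#     x = {"a": np.zeros(3), "b": [np.ones(2), None]}
+#     y = recursive_dict_list_tuple_apply(x, {np.ndarray: lambda a: a + 1, type(None): lambda a: a})
+#     # y["a"] == ones(3), y["b"][0] == 2 * ones(2), y["b"][1] is None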
+
+
+def map_tensor(x, func, error_on_missing_type=True):
+ """
+ Apply function @func to torch.Tensor objects in a nested dictionary or
+ list or tuple.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ func (function): function to apply to each tensor
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: func,
+ type(None): lambda x: x,
+ },
+ error_on_missing_type=error_on_missing_type,
+ )
+
+
+def map_ndarray(x, func, error_on_missing_type=True):
+ """
+ Apply function @func to np.ndarray objects in a nested dictionary or
+ list or tuple.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ func (function): function to apply to each array
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ np.ndarray: func,
+ type(None): lambda x: x,
+ },
+ error_on_missing_type=error_on_missing_type,
+ )
+
+
+def map_tensor_ndarray(x, tensor_func, ndarray_func, error_on_missing_type=True):
+ """
+ Apply function @tensor_func to torch.Tensor objects and @ndarray_func to
+ np.ndarray objects in a nested dictionary or list or tuple.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ tensor_func (function): function to apply to each tensor
+        ndarray_func (function): function to apply to each array
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: tensor_func,
+ np.ndarray: ndarray_func,
+ type(None): lambda x: x,
+ },
+ error_on_missing_type=error_on_missing_type,
+ )
+
+
+def clone(x):
+ """
+ Clones all torch tensors and numpy arrays in nested dictionary or list
+ or tuple and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x.clone(),
+ np.ndarray: lambda x: x.copy(),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def detach(x):
+ """
+ Detaches all torch tensors in nested dictionary or list
+ or tuple and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x.detach(),
+ }
+ )
+
+
+def to_batch(x):
+ """
+ Introduces a leading batch dimension of 1 for all torch tensors and numpy
+ arrays in nested dictionary or list or tuple and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x[None, ...],
+ np.ndarray: lambda x: x[None, ...],
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_sequence(x):
+ """
+ Introduces a time dimension of 1 at dimension 1 for all torch tensors and numpy
+ arrays in nested dictionary or list or tuple and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x[:, None, ...],
+ np.ndarray: lambda x: x[:, None, ...],
+ type(None): lambda x: x,
+ }
+ )
+
+
+def index_at_time(x, ind):
+ """
+ Indexes all torch tensors and numpy arrays in dimension 1 with index @ind in
+ nested dictionary or list or tuple and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ ind (int): index
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x[:, ind, ...],
+ np.ndarray: lambda x: x[:, ind, ...],
+ type(None): lambda x: x,
+ }
+ )
+
+
+def unsqueeze(x, dim):
+ """
+ Adds dimension of size 1 at dimension @dim in all torch tensors and numpy arrays
+ in nested dictionary or list or tuple and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ dim (int): dimension
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x.unsqueeze(dim=dim),
+ np.ndarray: lambda x: np.expand_dims(x, axis=dim),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def contiguous(x):
+ """
+ Makes all torch tensors and numpy arrays contiguous in nested dictionary or
+ list or tuple and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x.contiguous(),
+ np.ndarray: lambda x: np.ascontiguousarray(x),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_device(x, device):
+ """
+ Sends all torch tensors in nested dictionary or list or tuple to device
+ @device, and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ device (torch.Device): device to send tensors to
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x, d=device: x.to(d),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_tensor(x):
+ """
+ Converts all numpy arrays in nested dictionary or list or tuple to
+ torch tensors (and leaves existing torch Tensors as-is), and returns
+ a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x,
+ np.ndarray: lambda x: torch.from_numpy(x),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_numpy(x):
+ """
+ Converts all torch tensors in nested dictionary or list or tuple to
+ numpy (and leaves existing numpy arrays as-is), and returns
+ a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ def f(tensor):
+ if tensor.is_cuda:
+ return tensor.detach().cpu().numpy()
+ else:
+ return tensor.detach().numpy()
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: f,
+ np.ndarray: lambda x: x,
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_list(x):
+ """
+ Converts all torch tensors and numpy arrays in nested dictionary or list
+ or tuple to a list, and returns a new nested structure. Useful for
+ json encoding.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ def f(tensor):
+ if tensor.is_cuda:
+ return tensor.detach().cpu().numpy().tolist()
+ else:
+ return tensor.detach().numpy().tolist()
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: f,
+ np.ndarray: lambda x: x.tolist(),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_float(x):
+ """
+ Converts all torch tensors and numpy arrays in nested dictionary or list
+ or tuple to float type entries, and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x.float(),
+ np.ndarray: lambda x: x.astype(np.float32),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_uint8(x):
+ """
+ Converts all torch tensors and numpy arrays in nested dictionary or list
+ or tuple to uint8 type entries, and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x.byte(),
+ np.ndarray: lambda x: x.astype(np.uint8),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_uint16(x):
+ """
+ Converts all torch tensors and numpy arrays in nested dictionary or list
+ or tuple to uint16 type entries, and returns a new nested structure. Note
+ that torch does not support uint16, so int32 will be used (double storage).
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x.int(),
+ np.ndarray: lambda x: x.astype(np.uint16),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def to_torch(x, device):
+ """
+ Converts all numpy arrays and torch tensors in nested dictionary or list or tuple to
+ torch tensors on device @device and returns a new nested structure.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ device (torch.Device): device to send tensors to
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return to_device(to_float(to_tensor(x)), device)
+
+
+def to_one_hot_single(tensor, num_class):
+ """
+ Convert tensor to one-hot representation, assuming a certain number of total class labels.
+
+ Args:
+ tensor (torch.Tensor): tensor containing integer labels
+ num_class (int): number of classes
+
+ Returns:
+ x (torch.Tensor): tensor containing one-hot representation of labels
+ """
+ x = torch.zeros(tensor.size() + (num_class,)).to(tensor.device)
+ x.scatter_(-1, tensor.unsqueeze(-1), 1)
+ return x
+
+
+def to_one_hot(tensor, num_class):
+ """
+ Convert all tensors in nested dictionary or list or tuple to one-hot representation,
+ assuming a certain number of total class labels.
+
+ Args:
+ tensor (dict or list or tuple): a possibly nested dictionary or list or tuple
+ num_class (int): number of classes
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return map_tensor(tensor, func=lambda x, nc=num_class: to_one_hot_single(x, nc))
+
+
+def flatten_single(x, begin_axis=1):
+ """
+ Flatten a tensor in all dimensions from @begin_axis onwards.
+
+ Args:
+ x (torch.Tensor): tensor to flatten
+ begin_axis (int): which axis to flatten from
+
+ Returns:
+ y (torch.Tensor): flattened tensor
+ """
+ fixed_size = x.size()[:begin_axis]
+ _s = list(fixed_size) + [-1]
+ return x.reshape(*_s)
+
+
+def flatten(x, begin_axis=1):
+ """
+ Flatten all tensors in nested dictionary or list or tuple, from @begin_axis onwards.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ begin_axis (int): which axis to flatten from
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x, b=begin_axis: flatten_single(x, begin_axis=b),
+ }
+ )
+
+
+def reshape_dimensions_single(x, begin_axis, end_axis, target_dims):
+ """
+ Reshape selected dimensions in a tensor to a target dimension.
+
+ Args:
+ x (torch.Tensor): tensor to reshape
+ begin_axis (int): begin dimension
+ end_axis (int): end dimension (inclusive)
+ target_dims (tuple or list): target shape for the range of dimensions
+ (@begin_axis, @end_axis)
+
+ Returns:
+ y (torch.Tensor): reshaped tensor
+ """
+ assert(begin_axis <= end_axis)
+ assert(begin_axis >= 0)
+ assert(end_axis < len(x.shape))
+ assert(isinstance(target_dims, (tuple, list)))
+ s = x.shape
+ final_s = []
+ for i in range(len(s)):
+ if i == begin_axis:
+ final_s.extend(target_dims)
+ elif i < begin_axis or i > end_axis:
+ final_s.append(s[i])
+ return x.reshape(*final_s)
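+
+# Illustrative shape check for reshape_dimensions_single (not part of upstream robomimic):
+#
+#     x = torch.zeros(4, 6, 8)
+#     reshape_dimensions_single(x, begin_axis=1, end_axis=1, target_dims=(2, 3)).shape   # -> (4, 2, 3, 8)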
+
+
+def reshape_dimensions(x, begin_axis, end_axis, target_dims):
+ """
+ Reshape selected dimensions for all tensors in nested dictionary or list or tuple
+ to a target dimension.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ begin_axis (int): begin dimension
+ end_axis (int): end dimension (inclusive)
+ target_dims (tuple or list): target shape for the range of dimensions
+ (@begin_axis, @end_axis)
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single(
+ x, begin_axis=b, end_axis=e, target_dims=t),
+ np.ndarray: lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single(
+ x, begin_axis=b, end_axis=e, target_dims=t),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def join_dimensions(x, begin_axis, end_axis):
+ """
+ Joins all dimensions between dimensions (@begin_axis, @end_axis) into a flat dimension, for
+ all tensors in nested dictionary or list or tuple.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ begin_axis (int): begin dimension
+ end_axis (int): end dimension
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(
+ x, begin_axis=b, end_axis=e, target_dims=[-1]),
+ np.ndarray: lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(
+ x, begin_axis=b, end_axis=e, target_dims=[-1]),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def expand_at_single(x, size, dim):
+ """
+ Expand a tensor at a single dimension @dim by @size
+
+ Args:
+ x (torch.Tensor): input tensor
+ size (int): size to expand
+ dim (int): dimension to expand
+
+ Returns:
+ y (torch.Tensor): expanded tensor
+ """
+ assert dim < x.ndimension()
+ assert x.shape[dim] == 1
+ expand_dims = [-1] * x.ndimension()
+ expand_dims[dim] = size
+ return x.expand(*expand_dims)
+
+
+def expand_at(x, size, dim):
+ """
+ Expand all tensors in nested dictionary or list or tuple at a single
+ dimension @dim by @size.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ size (int): size to expand
+ dim (int): dimension to expand
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return map_tensor(x, lambda t, s=size, d=dim: expand_at_single(t, s, d))
+
+
+def unsqueeze_expand_at(x, size, dim):
+ """
+ Unsqueeze and expand a tensor at a dimension @dim by @size.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ size (int): size to expand
+ dim (int): dimension to unsqueeze and expand
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ x = unsqueeze(x, dim)
+ return expand_at(x, size, dim)
+
+
+def repeat_by_expand_at(x, repeats, dim):
+ """
+ Repeat a dimension by combining expand and reshape operations.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ repeats (int): number of times to repeat the target dimension
+ dim (int): dimension to repeat on
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ x = unsqueeze_expand_at(x, repeats, dim + 1)
+ return join_dimensions(x, dim, dim + 1)
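+
+# Illustrative usage sketch for repeat_by_expand_at (not part of upstream robomimic).
+# Repeats are grouped per element, i.e. [x1, x1, x2, x2, ...]:
+#
+#     x = torch.arange(3).reshape(3, 1)                  # rows [0], [1], [2]
+#     repeat_by_expand_at(x, repeats=2, dim=0).shape     # -> (6, 1), rows [0, 0, 1, 1, 2, 2]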
+
+
+def named_reduce_single(x, reduction, dim):
+ """
+ Reduce tensor at a dimension by named reduction functions.
+
+ Args:
+ x (torch.Tensor): tensor to be reduced
+ reduction (str): one of ["sum", "max", "mean", "flatten"]
+ dim (int): dimension to be reduced (or begin axis for flatten)
+
+ Returns:
+ y (torch.Tensor): reduced tensor
+ """
+ assert x.ndimension() > dim
+ assert reduction in ["sum", "max", "mean", "flatten"]
+ if reduction == "flatten":
+ x = flatten(x, begin_axis=dim)
+ elif reduction == "max":
+ x = torch.max(x, dim=dim)[0] # [B, D]
+ elif reduction == "sum":
+ x = torch.sum(x, dim=dim)
+ else:
+ x = torch.mean(x, dim=dim)
+ return x
+
+
+def named_reduce(x, reduction, dim):
+ """
+ Reduces all tensors in nested dictionary or list or tuple at a dimension
+ using a named reduction function.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ reduction (str): one of ["sum", "max", "mean", "flatten"]
+ dim (int): dimension to be reduced (or begin axis for flatten)
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return map_tensor(x, func=lambda t, r=reduction, d=dim: named_reduce_single(t, r, d))
+
+
+def gather_along_dim_with_dim_single(x, target_dim, source_dim, indices):
+ """
+ This function indexes out a target dimension of a tensor in a structured way,
+ by allowing a different value to be selected for each member of a flat index
+ tensor (@indices) corresponding to a source dimension. This can be interpreted
+ as moving along the source dimension, using the corresponding index value
+ in @indices to select values for all other dimensions outside of the
+ source and target dimensions. A common use case is to gather values
+ in target dimension 1 for each batch member (target dimension 0).
+
+ Args:
+ x (torch.Tensor): tensor to gather values for
+ target_dim (int): dimension to gather values along
+ source_dim (int): dimension to hold constant and use for gathering values
+ from the other dimensions
+ indices (torch.Tensor): flat index tensor with same shape as tensor @x along
+ @source_dim
+
+ Returns:
+ y (torch.Tensor): gathered tensor, with dimension @target_dim indexed out
+ """
+ assert len(indices.shape) == 1
+ assert x.shape[source_dim] == indices.shape[0]
+
+ # unsqueeze in all dimensions except the source dimension
+ new_shape = [1] * x.ndimension()
+ new_shape[source_dim] = -1
+ indices = indices.reshape(*new_shape)
+
+ # repeat in all dimensions - but preserve shape of source dimension,
+ # and make sure target_dimension has singleton dimension
+ expand_shape = list(x.shape)
+ expand_shape[source_dim] = -1
+ expand_shape[target_dim] = 1
+ indices = indices.expand(*expand_shape)
+
+ out = x.gather(dim=target_dim, index=indices)
+ return out.squeeze(target_dim)
+
+
+def gather_along_dim_with_dim(x, target_dim, source_dim, indices):
+ """
+ Apply @gather_along_dim_with_dim_single to all tensors in a nested
+ dictionary or list or tuple.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ target_dim (int): dimension to gather values along
+ source_dim (int): dimension to hold constant and use for gathering values
+ from the other dimensions
+ indices (torch.Tensor): flat index tensor with same shape as tensor @x along
+ @source_dim
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple
+ """
+ return map_tensor(x,
+ lambda y, t=target_dim, s=source_dim, i=indices: gather_along_dim_with_dim_single(y, t, s, i))
+
+
+def gather_sequence_single(seq, indices):
+ """
+ Given a tensor with leading dimensions [B, T, ...], gather an element from each sequence in
+ the batch given an index for each sequence.
+
+ Args:
+ seq (torch.Tensor): tensor with leading dimensions [B, T, ...]
+ indices (torch.Tensor): tensor indices of shape [B]
+
+    Returns:
+        y (torch.Tensor): indexed tensor of shape [B, ...]
+ """
+ return gather_along_dim_with_dim_single(seq, target_dim=1, source_dim=0, indices=indices)
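+
+# Illustrative usage sketch for gather_sequence_single (not part of upstream robomimic):
+#
+#     seq = torch.arange(6).reshape(2, 3)      # [[0, 1, 2], [3, 4, 5]]
+#     inds = torch.tensor([2, 0])              # per-sequence time index
+#     gather_sequence_single(seq, inds)        # -> tensor([2, 3])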
+
+
+def gather_sequence(seq, indices):
+ """
+ Given a nested dictionary or list or tuple, gathers an element from each sequence of the batch
+ for tensors with leading dimensions [B, T, ...].
+
+ Args:
+ seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors
+ of leading dimensions [B, T, ...]
+ indices (torch.Tensor): tensor indices of shape [B]
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple with tensors of shape [B, ...]
+ """
+ return gather_along_dim_with_dim(seq, target_dim=1, source_dim=0, indices=indices)
+
+
+def pad_sequence_single(seq, padding, batched=False, pad_same=True, pad_values=None):
+ """
+ Pad input tensor or array @seq in the time dimension (dimension 1).
+
+ Args:
+ seq (np.ndarray or torch.Tensor): sequence to be padded
+ padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1
+ batched (bool): if sequence has the batch dimension
+ pad_same (bool): if pad by duplicating
+        pad_values (float): value to pad with if not @pad_same (must be a float)
+
+ Returns:
+ padded sequence (np.ndarray or torch.Tensor)
+ """
+ assert isinstance(seq, (np.ndarray, torch.Tensor))
+ assert pad_same or (pad_values is not None)
+ if pad_values is not None:
+ assert isinstance(pad_values, float)
+ repeat_func = np.repeat if isinstance(seq, np.ndarray) else torch.repeat_interleave
+ concat_func = np.concatenate if isinstance(seq, np.ndarray) else torch.cat
+ ones_like_func = np.ones_like if isinstance(seq, np.ndarray) else torch.ones_like
+ seq_dim = 1 if batched else 0
+
+ begin_pad = []
+ end_pad = []
+
+ if padding[0] > 0:
+ if batched:
+ pad = seq[:, [0]] if pad_same else ones_like_func(seq[:, [0]]) * pad_values
+ else:
+ pad = seq[[0]] if pad_same else ones_like_func(seq[[0]]) * pad_values
+ begin_pad.append(repeat_func(pad, padding[0], seq_dim))
+ if padding[1] > 0:
+ if batched:
+ pad = seq[:, [-1]] if pad_same else ones_like_func(seq[:, [-1]]) * pad_values
+ else:
+ pad = seq[[-1]] if pad_same else ones_like_func(seq[[-1]]) * pad_values
+ end_pad.append(repeat_func(pad, padding[1], seq_dim))
+
+ return concat_func(begin_pad + [seq] + end_pad, seq_dim)
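+
+# Illustrative usage sketch for pad_sequence_single (not part of upstream robomimic),
+# padding by duplicating the boundary elements:
+#
+#     seq = np.arange(4)                                       # [0, 1, 2, 3]
+#     pad_sequence_single(seq, padding=(1, 2), pad_same=True)  # -> [0, 0, 1, 2, 3, 3, 3]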
+
+
+def pad_sequence(seq, padding, batched=False, pad_same=True, pad_values=None):
+ """
+ Pad a nested dictionary or list or tuple of sequence tensors in the time dimension (dimension 1).
+
+ Args:
+ seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors
+ of leading dimensions [B, T, ...]
+ padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1
+ batched (bool): if sequence has the batch dimension
+ pad_same (bool): if pad by duplicating
+        pad_values (float): value to pad with if not @pad_same (must be a float)
+
+ Returns:
+ padded sequence (dict or list or tuple)
+ """
+ return recursive_dict_list_tuple_apply(
+ seq,
+ {
+ torch.Tensor: lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values:
+ pad_sequence_single(x, p, b, ps, pv),
+ np.ndarray: lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values:
+ pad_sequence_single(x, p, b, ps, pv),
+ type(None): lambda x: x,
+ }
+ )
+
+
+def assert_size_at_dim_single(x, size, dim, msg):
+ """
+ Ensure that array or tensor @x has size @size in dim @dim.
+
+ Args:
+ x (np.ndarray or torch.Tensor): input array or tensor
+ size (int): size that tensors should have at @dim
+ dim (int): dimension to check
+ msg (str): text to display if assertion fails
+ """
+ assert x.shape[dim] == size, msg
+
+
+def assert_size_at_dim(x, size, dim, msg):
+ """
+ Ensure that arrays and tensors in nested dictionary or list or tuple have
+ size @size in dim @dim.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+ size (int): size that tensors should have at @dim
+ dim (int): dimension to check
+ """
+ map_tensor(x, lambda t, s=size, d=dim, m=msg: assert_size_at_dim_single(t, s, d, m))
+
+
+def get_shape(x):
+ """
+ Get all shapes of arrays and tensors in nested dictionary or list or tuple.
+
+ Args:
+ x (dict or list or tuple): a possibly nested dictionary or list or tuple
+
+ Returns:
+ y (dict or list or tuple): new nested dict-list-tuple that contains each array or
+ tensor's shape
+ """
+ return recursive_dict_list_tuple_apply(
+ x,
+ {
+ torch.Tensor: lambda x: x.shape,
+ np.ndarray: lambda x: x.shape,
+ type(None): lambda x: x,
+ }
+ )
+
+
+def list_of_flat_dict_to_dict_of_list(list_of_dict):
+ """
+ Helper function to go from a list of flat dictionaries to a dictionary of lists.
+ By "flat" we mean that none of the values are dictionaries, but are numpy arrays,
+ floats, etc.
+
+ Args:
+ list_of_dict (list): list of flat dictionaries
+
+ Returns:
+ dict_of_list (dict): dictionary of lists
+ """
+ assert isinstance(list_of_dict, list)
+ dic = collections.OrderedDict()
+ for i in range(len(list_of_dict)):
+ for k in list_of_dict[i]:
+ if k not in dic:
+ dic[k] = []
+ dic[k].append(list_of_dict[i][k])
+ return dic
+
+
+def flatten_nested_dict_list(d, parent_key='', sep='_', item_key=''):
+ """
+ Flatten a nested dict or list to a list.
+
+ For example, given a dict
+ {
+ a: 1
+ b: {
+ c: 2
+ }
+ c: 3
+ }
+
+ the function would return [(a, 1), (b_c, 2), (c, 3)]
+
+ Args:
+ d (dict, list): a nested dict or list to be flattened
+ parent_key (str): recursion helper
+ sep (str): separator for nesting keys
+ item_key (str): recursion helper
+ Returns:
+ list: a list of (key, value) tuples
+ """
+ items = []
+ if isinstance(d, (tuple, list)):
+ new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
+ for i, v in enumerate(d):
+ items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=str(i)))
+ return items
+ elif isinstance(d, dict):
+ new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
+ for k, v in d.items():
+ assert isinstance(k, str)
+ items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=k))
+ return items
+ else:
+ new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key
+ return [(new_key, d)]
+
+
+def time_distributed(inputs, op, activation=None, inputs_as_kwargs=False, inputs_as_args=False, **kwargs):
+ """
+ Apply function @op to all tensors in nested dictionary or list or tuple @inputs in both the
+ batch (B) and time (T) dimension, where the tensors are expected to have shape [B, T, ...].
+ Will do this by reshaping tensors to [B * T, ...], passing through the op, and then reshaping
+ outputs to [B, T, ...].
+
+ Args:
+ inputs (list or tuple or dict): a possibly nested dictionary or list or tuple with tensors
+ of leading dimensions [B, T, ...]
+ op: a layer op that accepts inputs
+ activation: activation to apply at the output
+ inputs_as_kwargs (bool): whether to feed input as a kwargs dict to the op
+        inputs_as_args (bool): whether to feed input as an args list to the op
+ kwargs (dict): other kwargs to supply to the op
+
+ Returns:
+ outputs (dict or list or tuple): new nested dict-list-tuple with tensors of leading dimension [B, T].
+ """
+ batch_size, seq_len = flatten_nested_dict_list(inputs)[0][1].shape[:2]
+ inputs = join_dimensions(inputs, 0, 1)
+ if inputs_as_kwargs:
+ outputs = op(**inputs, **kwargs)
+ elif inputs_as_args:
+ outputs = op(*inputs, **kwargs)
+ else:
+ outputs = op(inputs, **kwargs)
+
+ if activation is not None:
+ outputs = map_tensor(outputs, activation)
+ outputs = reshape_dimensions(outputs, begin_axis=0, end_axis=0, target_dims=(batch_size, seq_len))
+ return outputs
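+
+# Example (illustrative sketch; assumes the simple case of a plain tensor input and a
+# torch.nn.Linear op rather than a nested observation dict):
+# >>> x = torch.randn(4, 10, 7)                            # [B=4, T=10, D=7]
+# >>> out = time_distributed(x, torch.nn.Linear(7, 32))    # op sees [B * T, 7], output reshaped back
+# >>> out.shape
+# torch.Size([4, 10, 32])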
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/test_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d70c0f073b7b22c0b3608ba5c4408a23904259a
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/test_utils.py
@@ -0,0 +1,270 @@
+"""
+Utilities for testing algorithm implementations - used mainly by scripts in tests directory.
+"""
+import os
+import json
+import shutil
+import traceback
+from termcolor import colored
+
+import numpy as np
+import torch
+
+import robomimic
+import robomimic.utils.file_utils as FileUtils
+import robomimic.utils.torch_utils as TorchUtils
+import robomimic.macros as Macros
+from robomimic.config import Config, config_factory
+from robomimic.scripts.train import train
+
+
+def maybe_remove_dir(dir_to_remove):
+ """
+ Remove directory if it exists.
+
+ Args:
+ dir_to_remove (str): path to directory to remove
+ """
+ if os.path.exists(dir_to_remove):
+ shutil.rmtree(dir_to_remove)
+
+
+def maybe_remove_file(file_to_remove):
+ """
+ Remove file if it exists.
+
+ Args:
+ file_to_remove (str): path to file to remove
+ """
+ if os.path.exists(file_to_remove):
+ os.remove(file_to_remove)
+
+
+def example_dataset_path():
+ """
+ Path to dataset to use for testing and example purposes. It should
+ exist under the tests/assets directory, and will be downloaded
+ from a server if it does not exist.
+ """
+ dataset_folder = os.path.join(robomimic.__path__[0], "../tests/assets/")
+ dataset_path = os.path.join(dataset_folder, "test_v141.hdf5")
+ if not os.path.exists(dataset_path):
+ print("\nWARNING: test hdf5 does not exist! Downloading from server...")
+ os.makedirs(dataset_folder, exist_ok=True)
+ FileUtils.download_url(
+ url="http://downloads.cs.stanford.edu/downloads/rt_benchmark/test_v141.hdf5",
+ download_dir=dataset_folder,
+ )
+ return dataset_path
+
+
+def example_momart_dataset_path():
+ """
+ Path to momart dataset to use for testing and example purposes. It should
+ exist under the tests/assets directory, and will be downloaded
+ from a server if it does not exist.
+ """
+ dataset_folder = os.path.join(robomimic.__path__[0], "../tests/assets/")
+ dataset_path = os.path.join(dataset_folder, "test_momart.hdf5")
+ if not os.path.exists(dataset_path):
+ user_response = input("\nWARNING: momart test hdf5 does not exist! We will download sample dataset. "
+ "This will take 0.6GB space. Proceed? y/n\n")
+ assert user_response.lower() in {"yes", "y"}, f"Did not receive confirmation. Aborting download."
+
+ print("\nDownloading from server...")
+
+ os.makedirs(dataset_folder, exist_ok=True)
+ FileUtils.download_url(
+ url="http://downloads.cs.stanford.edu/downloads/rt_mm/sample/test_momart.hdf5",
+ download_dir=dataset_folder,
+ )
+ return dataset_path
+
+
+def temp_model_dir_path():
+ """
+ Path to a temporary model directory to write to for testing and example purposes.
+ """
+ return os.path.join(robomimic.__path__[0], "../tests/tmp_model_dir")
+
+
+def temp_dataset_path():
+ """
+ Defines default dataset path to write to for testing.
+ """
+ return os.path.join(robomimic.__path__[0], "../tests/", "tmp.hdf5")
+
+
+def temp_video_path():
+ """
+ Defines default video path to write to for testing.
+ """
+ return os.path.join(robomimic.__path__[0], "../tests/", "tmp.mp4")
+
+
+def get_base_config(algo_name):
+ """
+ Base config for testing algorithms.
+
+ Args:
+ algo_name (str): name of algorithm - loads the corresponding json
+ from the config templates directory
+ """
+
+ # we will load and override defaults from template config
+ base_config_path = os.path.join(robomimic.__path__[0], "exps/templates/{}.json".format(algo_name))
+ with open(base_config_path, 'r') as f:
+ config = Config(json.load(f))
+
+ # small dataset with a handful of trajectories
+ config.train.data = example_dataset_path()
+
+ # temporary model dir
+ model_dir = temp_model_dir_path()
+ maybe_remove_dir(model_dir)
+ config.train.output_dir = model_dir
+
+ # train and validate for 3 gradient steps
+ config.experiment.name = "test"
+ config.experiment.validate = True
+ config.experiment.epoch_every_n_steps = 3
+ config.experiment.validation_epoch_every_n_steps = 3
+ config.train.num_epochs = 1
+
+ # default train and validation filter keys
+ config.train.hdf5_filter_key = "train"
+ config.train.hdf5_validation_filter_key = "valid"
+
+ # ensure model saving, rollout, and offscreen video rendering are tested too
+ config.experiment.save.enabled = True
+ config.experiment.save.every_n_epochs = 1
+ config.experiment.rollout.enabled = True
+ config.experiment.rollout.rate = 1
+ config.experiment.rollout.n = 1
+ config.experiment.rollout.horizon = 10
+ config.experiment.render_video = True
+
+ # turn off logging to stdout, since that can interfere with testing code outputs
+ config.experiment.logging.terminal_output_to_txt = False
+
+ # test cuda (if available)
+ config.train.cuda = True
+
+ return config
+
+
+def config_from_modifier(base_config, config_modifier):
+ """
+ Helper function to load a base config, modify it using
+    the passed @config_modifier function, and finalize it
+ for training.
+
+ Args:
+ base_config (BaseConfig instance): starting config object that is
+ loaded (to change algorithm config defaults), and then modified
+ with @config_modifier
+
+ config_modifier (function): function that takes a config object as
+ input, and modifies it
+ """
+
+ # algo name to default config for this algorithm
+ algo_name = base_config["algo_name"]
+ config = config_factory(algo_name)
+
+ # update config with the settings specified in the base config
+ with config.unlocked():
+ config.update(base_config)
+
+ # modify the config and finalize it for training (no more modifications allowed)
+ config = config_modifier(config)
+
+ return config
+
+
+def checkpoint_path_from_test_run():
+ """
+ Helper function that gets the path of a model checkpoint after a test training run is finished.
+ """
+ exp_dir = os.path.join(temp_model_dir_path(), "test")
+ time_dir_names = [f.name for f in os.scandir(exp_dir) if f.is_dir()]
+ assert len(time_dir_names) == 1
+ path_to_models = os.path.join(exp_dir, time_dir_names[0], "models")
+ epoch_name = [f.name for f in os.scandir(path_to_models) if f.name.startswith("model")][0]
+ return os.path.join(path_to_models, epoch_name)
+
+
+def test_eval_agent_from_checkpoint(ckpt_path, device):
+ """
+ Test loading a model from checkpoint and running a rollout with the
+ trained agent for a small number of steps.
+
+ Args:
+ ckpt_path (str): path to a checkpoint pth file
+
+ device (torch.Device): torch device
+ """
+
+ # get policy and env from checkpoint
+ policy, ckpt_dict = FileUtils.policy_from_checkpoint(ckpt_path=ckpt_path, device=device, verbose=True)
+ env, _ = FileUtils.env_from_checkpoint(ckpt_dict=ckpt_dict, verbose=True)
+
+ # run a test rollout
+ ob_dict = env.reset()
+ policy.start_episode()
+ for _ in range(15):
+ ac = policy(ob=ob_dict)
+ ob_dict, r, done, _ = env.step(ac)
+
+
+def test_run(base_config, config_modifier):
+ """
+ Takes a base_config and config_modifier (function that modifies a passed Config object)
+ and runs training as a test. It also takes the trained checkpoint, tries to load the
+ policy and environment from the checkpoint, and run an evaluation rollout. Returns
+ a string that is colored green if the run finished successfully without any issues,
+ and colored red if an error occurred. If an error occurs, the traceback is included
+ in the string.
+
+ Args:
+ base_config (BaseConfig instance): starting config object that is
+ loaded (to change algorithm config defaults), and then modified
+ with @config_modifier
+
+ config_modifier (function): function that takes a config object as
+ input, and modifies it
+
+ Returns:
+ ret (str): a green "passed!" string, or a red "failed with error" string that contains
+ the traceback
+ """
+
+ # disable some macros for testing
+ Macros.RESULTS_SYNC_PATH = None
+ Macros.USE_MAGLEV = False
+ Macros.USE_NGC = False
+
+ try:
+ # get config
+ config = config_from_modifier(base_config=base_config, config_modifier=config_modifier)
+
+ # set torch device
+ device = TorchUtils.get_torch_device(try_to_use_cuda=config.train.cuda)
+
+ # run training
+ train(config, device=device)
+
+ # test evaluating a trained agent using saved checkpoint
+ ckpt_path = checkpoint_path_from_test_run()
+ test_eval_agent_from_checkpoint(ckpt_path, device=device)
+
+ # indicate success
+ ret = colored("passed!", "green")
+
+ except Exception as e:
+ # indicate failure by returning error string
+ ret = colored("failed with error:\n{}\n\n{}".format(e, traceback.format_exc()), "red")
+
+ # make sure model directory is cleaned up before returning from this function
+ maybe_remove_dir(temp_model_dir_path())
+
+ return ret
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/torch_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/torch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..494dbddb7c37a05d8b7c1c66cf96aff8255655fa
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/torch_utils.py
@@ -0,0 +1,489 @@
+"""
+This file contains some PyTorch utilities.
+"""
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+
+
+def soft_update(source, target, tau):
+ """
+ Soft update from the parameters of a @source torch module to a @target torch module
+ with strength @tau. The update follows target = target * (1 - tau) + source * tau.
+
+ Args:
+ source (torch.nn.Module): source network to push target network parameters towards
+        target (torch.nn.Module): target network to update
+        tau (float): strength of the update (target = target * (1 - tau) + source * tau)
+    """
+    with torch.no_grad():
+        for target_param, param in zip(target.parameters(), source.parameters()):
+            target_param.copy_(
+                target_param * (1.0 - tau) + param * tau
+            )
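+
+# Example (illustrative; critic and target_critic are hypothetical torch.nn.Module instances):
+# Polyak-average a target network towards its online counterpart after each gradient step.
+# >>> soft_update(source=critic, target=target_critic, tau=0.005)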
+
+
+def hard_update(source, target):
+ """
+ Hard update @target parameters to match @source.
+
+ Args:
+ source (torch.nn.Module): source network to provide parameters
+ target (torch.nn.Module): target network to update parameters for
+ """
+    with torch.no_grad():
+        for target_param, param in zip(target.parameters(), source.parameters()):
+            target_param.copy_(param)
+
+
+def get_torch_device(try_to_use_cuda):
+ """
+ Return torch device. If using cuda (GPU), will also set cudnn.benchmark to True
+ to optimize CNNs.
+
+ Args:
+ try_to_use_cuda (bool): if True and cuda is available, will use GPU
+
+ Returns:
+ device (torch.Device): device to use for models
+ """
+ if try_to_use_cuda and torch.cuda.is_available():
+ torch.backends.cudnn.benchmark = True
+ device = torch.device("cuda:0")
+ else:
+ device = torch.device("cpu")
+ return device
+
+
+def reparameterize(mu, logvar):
+ """
+ Reparameterize for the backpropagation of z instead of q.
+ This makes it so that we can backpropagate through the sampling of z from
+ our encoder when feeding the sampled variable to the decoder.
+
+ (See "The reparameterization trick" section of https://arxiv.org/abs/1312.6114)
+
+ Args:
+ mu (torch.Tensor): batch of means from the encoder distribution
+ logvar (torch.Tensor): batch of log variances from the encoder distribution
+
+ Returns:
+ z (torch.Tensor): batch of sampled latents from the encoder distribution that
+ support backpropagation
+ """
+ # logvar = \log(\sigma^2) = 2 * \log(\sigma)
+ # \sigma = \exp(0.5 * logvar)
+
+ # clamped for numerical stability
+ logstd = (0.5 * logvar).clamp(-4, 15)
+ std = torch.exp(logstd)
+
+ # Sample \epsilon from normal distribution
+ # use std to create a new tensor, so we don't have to care
+ # about running on GPU or not
+ eps = std.new(std.size()).normal_()
+
+ # Then multiply with the standard deviation and add the mean
+ z = eps.mul(std).add_(mu)
+
+ return z
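+
+# Example (illustrative): with mu = 0 and logvar = 0 the samples come from a standard normal.
+# >>> z = reparameterize(mu=torch.zeros(16, 8), logvar=torch.zeros(16, 8))
+# >>> z.shape
+# torch.Size([16, 8])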
+
+
+def optimizer_from_optim_params(net_optim_params, net):
+ """
+ Helper function to return a torch Optimizer from the optim_params
+ section of the config for a particular network.
+
+ Args:
+        net_optim_params (Config): optim_params part of algo_config corresponding
+            to @net. This determines the optimizer that is created.
+
+        net (torch.nn.Module): module whose parameters this optimizer will be
+            responsible for
+
+ Returns:
+ optimizer (torch.optim.Optimizer): optimizer
+ """
+ optimizer_type = net_optim_params.get("optimizer_type", "adam")
+ lr = net_optim_params["learning_rate"]["initial"]
+
+ if optimizer_type == "adam":
+ return optim.Adam(
+ params=net.parameters(),
+ lr=lr,
+ weight_decay=net_optim_params["regularization"]["L2"],
+ )
+ elif optimizer_type == "adamw":
+ return optim.AdamW(
+ params=net.parameters(),
+ lr=lr,
+ weight_decay=net_optim_params["regularization"]["L2"],
+ )
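+
+# Example (illustrative sketch of the expected @net_optim_params structure; in a robomimic
+# config this typically comes from config.algo.optim_params.<net_name>, and `net` stands in
+# for any torch.nn.Module):
+# >>> optim_params = {
+# ...     "optimizer_type": "adamw",
+# ...     "learning_rate": {"initial": 1e-4},
+# ...     "regularization": {"L2": 0.01},
+# ... }
+# >>> optimizer = optimizer_from_optim_params(optim_params, net)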
+
+
+def lr_scheduler_from_optim_params(net_optim_params, net, optimizer):
+ """
+ Helper function to return a LRScheduler from the optim_params
+ section of the config for a particular network. Returns None
+ if a scheduler is not needed.
+
+ Args:
+        net_optim_params (Config): optim_params part of algo_config corresponding
+            to @net. This determines whether a learning rate scheduler is created.
+
+        net (torch.nn.Module): module whose parameters this optimizer will be
+            responsible for
+
+ optimizer (torch.optim.Optimizer): optimizer for this net
+
+ Returns:
+ lr_scheduler (torch.optim.lr_scheduler or None): learning rate scheduler
+ """
+ lr_scheduler_type = net_optim_params["learning_rate"].get("scheduler_type", "multistep")
+ epoch_schedule = net_optim_params["learning_rate"]["epoch_schedule"]
+
+ lr_scheduler = None
+ if len(epoch_schedule) > 0:
+ if lr_scheduler_type == "linear":
+ assert len(epoch_schedule) == 1
+ end_epoch = epoch_schedule[0]
+
+ return optim.lr_scheduler.LinearLR(
+ optimizer,
+ start_factor=1.0,
+ end_factor=net_optim_params["learning_rate"]["decay_factor"],
+ total_iters=end_epoch,
+ )
+ elif lr_scheduler_type == "multistep":
+ return optim.lr_scheduler.MultiStepLR(
+ optimizer=optimizer,
+ milestones=epoch_schedule,
+ gamma=net_optim_params["learning_rate"]["decay_factor"],
+ )
+ else:
+ raise ValueError("Invalid LR scheduler type: {}".format(lr_scheduler_type))
+
+ return lr_scheduler
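+
+# Example (illustrative, continuing the optim_params sketch above): a "multistep" schedule
+# that multiplies the LR by decay_factor at epochs 100 and 200. An empty "epoch_schedule"
+# list yields no scheduler (None).
+# >>> optim_params["learning_rate"].update(
+# ...     {"decay_factor": 0.1, "epoch_schedule": [100, 200], "scheduler_type": "multistep"})
+# >>> scheduler = lr_scheduler_from_optim_params(optim_params, net, optimizer)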
+
+
+def backprop_for_loss(net, optim, loss, max_grad_norm=None, retain_graph=False):
+ """
+ Backpropagate loss and update parameters for network with
+ name @name.
+
+ Args:
+ net (torch.nn.Module): network to update
+
+ optim (torch.optim.Optimizer): optimizer to use
+
+ loss (torch.Tensor): loss to use for backpropagation
+
+ max_grad_norm (float): if provided, used to clip gradients
+
+ retain_graph (bool): if True, graph is not freed after backward call
+
+ Returns:
+        grad_norms (float): sum of squared gradient norms over all parameters after backpropagation
+ """
+
+ # backprop
+ optim.zero_grad()
+ loss.backward(retain_graph=retain_graph)
+
+ # gradient clipping
+ if max_grad_norm is not None:
+ torch.nn.utils.clip_grad_norm_(net.parameters(), max_grad_norm)
+
+ # compute grad norms
+ grad_norms = 0.
+ for p in net.parameters():
+        # only accumulate norms for parameters that actually received gradients
+ if p.grad is not None:
+ grad_norms += p.grad.data.norm(2).pow(2).item()
+
+ # step
+ optim.step()
+
+ return grad_norms
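+
+# Example (illustrative sketch of a single clipped gradient step; net, optimizer and loss are
+# assumed to come from the surrounding training code):
+# >>> grad_norms = backprop_for_loss(net, optimizer, loss, max_grad_norm=1.0)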
+
+
+def rot_6d_to_axis_angle(rot_6d):
+ """
+ Converts tensor with rot_6d representation to axis-angle representation.
+ """
+ rot_mat = rotation_6d_to_matrix(rot_6d)
+ rot = matrix_to_axis_angle(rot_mat)
+ return rot
+
+
+def axis_angle_to_rot_6d(axis_angle):
+ """
+    Converts tensor with axis-angle representation to rot_6d representation.
+ """
+ rot_mat = axis_angle_to_matrix(axis_angle)
+ rot_6d = matrix_to_rotation_6d(rot_mat)
+ return rot_6d
+
+
+class dummy_context_mgr():
+ """
+ A dummy context manager - useful for having conditional scopes (such
+ as @maybe_no_grad). Nothing happens in this scope.
+ """
+ def __enter__(self):
+ return None
+ def __exit__(self, exc_type, exc_value, traceback):
+ return False
+
+
+def maybe_no_grad(no_grad):
+ """
+ Args:
+ no_grad (bool): if True, the returned context will be torch.no_grad(), otherwise
+ it will be a dummy context
+ """
+ return torch.no_grad() if no_grad else dummy_context_mgr()
+
+
+"""
+The following utility functions were taken from PyTorch3D:
+https://github.com/facebookresearch/pytorch3d/blob/d84f274a0822da969668d00e831870fd88327845/pytorch3d/transforms/rotation_conversions.py
+"""
+def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
+ """
+ Returns torch.sqrt(torch.max(0, x))
+ but with a zero subgradient where x is 0.
+ """
+ ret = torch.zeros_like(x)
+ positive_mask = x > 0
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
+ return ret
+
+
+def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as quaternions to rotation matrices.
+
+ Args:
+ quaternions: quaternions with real part first,
+ as tensor of shape (..., 4).
+
+ Returns:
+ Rotation matrices as tensor of shape (..., 3, 3).
+ """
+ r, i, j, k = torch.unbind(quaternions, -1)
+ # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
+
+ o = torch.stack(
+ (
+ 1 - two_s * (j * j + k * k),
+ two_s * (i * j - k * r),
+ two_s * (i * k + j * r),
+ two_s * (i * j + k * r),
+ 1 - two_s * (i * i + k * k),
+ two_s * (j * k - i * r),
+ two_s * (i * k - j * r),
+ two_s * (j * k + i * r),
+ 1 - two_s * (i * i + j * j),
+ ),
+ -1,
+ )
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
+
+
+def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as rotation matrices to quaternions.
+
+ Args:
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
+
+ Returns:
+ quaternions with real part first, as tensor of shape (..., 4).
+ """
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
+
+ batch_dim = matrix.shape[:-2]
+ m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
+ matrix.reshape(batch_dim + (9,)), dim=-1
+ )
+
+ q_abs = _sqrt_positive_part(
+ torch.stack(
+ [
+ 1.0 + m00 + m11 + m22,
+ 1.0 + m00 - m11 - m22,
+ 1.0 - m00 + m11 - m22,
+ 1.0 - m00 - m11 + m22,
+ ],
+ dim=-1,
+ )
+ )
+
+ # we produce the desired quaternion multiplied by each of r, i, j, k
+ quat_by_rijk = torch.stack(
+ [
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
+ # `int`.
+ torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
+ # `int`.
+ torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
+ # `int`.
+ torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
+ # `int`.
+ torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
+ ],
+ dim=-2,
+ )
+
+ # We floor here at 0.1 but the exact level is not important; if q_abs is small,
+ # the candidate won't be picked.
+ flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
+ quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
+
+ # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
+ # forall i; we pick the best-conditioned one (with the largest denominator)
+
+ return quat_candidates[
+ F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
+ ].reshape(batch_dim + (4,))
+
+
+def axis_angle_to_matrix(axis_angle: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as axis/angle to rotation matrices.
+
+ Args:
+ axis_angle: Rotations given as a vector in axis angle form,
+ as a tensor of shape (..., 3), where the magnitude is
+ the angle turned anticlockwise in radians around the
+ vector's direction.
+
+ Returns:
+ Rotation matrices as tensor of shape (..., 3, 3).
+ """
+ return quaternion_to_matrix(axis_angle_to_quaternion(axis_angle))
+
+
+def matrix_to_axis_angle(matrix: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as rotation matrices to axis/angle.
+
+ Args:
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
+
+ Returns:
+ Rotations given as a vector in axis angle form, as a tensor
+ of shape (..., 3), where the magnitude is the angle
+ turned anticlockwise in radians around the vector's
+ direction.
+ """
+ return quaternion_to_axis_angle(matrix_to_quaternion(matrix))
+
+
+def axis_angle_to_quaternion(axis_angle: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as axis/angle to quaternions.
+
+ Args:
+ axis_angle: Rotations given as a vector in axis angle form,
+ as a tensor of shape (..., 3), where the magnitude is
+ the angle turned anticlockwise in radians around the
+ vector's direction.
+
+ Returns:
+ quaternions with real part first, as tensor of shape (..., 4).
+ """
+ angles = torch.norm(axis_angle, p=2, dim=-1, keepdim=True)
+ half_angles = angles * 0.5
+ eps = 1e-6
+ small_angles = angles.abs() < eps
+ sin_half_angles_over_angles = torch.empty_like(angles)
+ sin_half_angles_over_angles[~small_angles] = (
+ torch.sin(half_angles[~small_angles]) / angles[~small_angles]
+ )
+ # for x small, sin(x/2) is about x/2 - (x/2)^3/6
+ # so sin(x/2)/x is about 1/2 - (x*x)/48
+ sin_half_angles_over_angles[small_angles] = (
+ 0.5 - (angles[small_angles] * angles[small_angles]) / 48
+ )
+ quaternions = torch.cat(
+ [torch.cos(half_angles), axis_angle * sin_half_angles_over_angles], dim=-1
+ )
+ return quaternions
+
+
+def quaternion_to_axis_angle(quaternions: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as quaternions to axis/angle.
+
+ Args:
+ quaternions: quaternions with real part first,
+ as tensor of shape (..., 4).
+
+ Returns:
+ Rotations given as a vector in axis angle form, as a tensor
+ of shape (..., 3), where the magnitude is the angle
+ turned anticlockwise in radians around the vector's
+ direction.
+ """
+ norms = torch.norm(quaternions[..., 1:], p=2, dim=-1, keepdim=True)
+ half_angles = torch.atan2(norms, quaternions[..., :1])
+ angles = 2 * half_angles
+ eps = 1e-6
+ small_angles = angles.abs() < eps
+ sin_half_angles_over_angles = torch.empty_like(angles)
+ sin_half_angles_over_angles[~small_angles] = (
+ torch.sin(half_angles[~small_angles]) / angles[~small_angles]
+ )
+ # for x small, sin(x/2) is about x/2 - (x/2)^3/6
+ # so sin(x/2)/x is about 1/2 - (x*x)/48
+ sin_half_angles_over_angles[small_angles] = (
+ 0.5 - (angles[small_angles] * angles[small_angles]) / 48
+ )
+ return quaternions[..., 1:] / sin_half_angles_over_angles
+
+
+def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
+ """
+ Converts 6D rotation representation by Zhou et al. [1] to rotation matrix
+ using Gram--Schmidt orthogonalization per Section B of [1].
+ Args:
+ d6: 6D rotation representation, of size (*, 6)
+
+ Returns:
+ batch of rotation matrices of size (*, 3, 3)
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+
+ a1, a2 = d6[..., :3], d6[..., 3:]
+ b1 = F.normalize(a1, dim=-1)
+ b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
+ b2 = F.normalize(b2, dim=-1)
+ b3 = torch.cross(b1, b2, dim=-1)
+ return torch.stack((b1, b2, b3), dim=-2)
+
+
+def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor:
+ """
+ Converts rotation matrices to 6D rotation representation by Zhou et al. [1]
+ by dropping the last row. Note that 6D representation is not unique.
+ Args:
+ matrix: batch of rotation matrices of size (*, 3, 3)
+
+ Returns:
+ 6D rotation representation, of size (*, 6)
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ batch_dim = matrix.size()[:-2]
+ return matrix[..., :2, :].clone().reshape(batch_dim + (6,))
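+
+# Example (illustrative): the 6D representation round-trips through a proper rotation matrix.
+# >>> R = axis_angle_to_matrix(torch.randn(8, 3))            # (8, 3, 3)
+# >>> d6 = matrix_to_rotation_6d(R)                          # (8, 6)
+# >>> torch.allclose(rotation_6d_to_matrix(d6), R, atol=1e-5)
+# True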
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/train_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/train_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c86ef393194c362cdc6ef2adf511c92536c9aae
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/train_utils.py
@@ -0,0 +1,806 @@
+"""
+This file contains several utility functions used to define the main training loop. It
+mainly consists of functions to assist with logging, rollouts, and the @run_epoch function,
+which is the core training logic for models in this repository.
+"""
+import os
+import time
+import datetime
+import shutil
+import json
+import h5py
+import imageio
+import numpy as np
+from copy import deepcopy
+from collections import OrderedDict
+
+import torch
+
+import robomimic
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.log_utils as LogUtils
+import robomimic.utils.file_utils as FileUtils
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.env_utils as EnvUtils
+import robomimic.macros as Macros
+
+
+from robomimic.utils.dataset import SequenceDataset, R2D2Dataset, MetaDataset
+from robomimic.envs.env_base import EnvBase
+from robomimic.envs.wrappers import EnvWrapper
+from robomimic.algo import RolloutPolicy
+
+
+def get_exp_dir(config, auto_remove_exp_dir=False):
+ """
+ Create experiment directory from config. If an identical experiment directory
+ exists and @auto_remove_exp_dir is False (default), the function will prompt
+ the user on whether to remove and replace it, or keep the existing one and
+ add a new subdirectory with the new timestamp for the current run.
+
+    Args:
+        config (BaseConfig instance): config object used to determine the experiment name and output directory
+
+        auto_remove_exp_dir (bool): if True, automatically remove the existing experiment
+            folder if it exists at the same path.
+
+ Returns:
+ log_dir (str): path to created log directory (sub-folder in experiment directory)
+ output_dir (str): path to created models directory (sub-folder in experiment directory)
+ to store model checkpoints
+ video_dir (str): path to video directory (sub-folder in experiment directory)
+ to store rollout videos
+ """
+ assert not (Macros.USE_MAGLEV and Macros.USE_NGC)
+ if Macros.USE_MAGLEV or Macros.USE_NGC:
+ # remove existing experiment directory automatically if path exists so that we don't block on user input
+ auto_remove_exp_dir = True
+
+ # timestamp for directory names
+ t_now = time.time()
+ time_str = datetime.datetime.fromtimestamp(t_now).strftime('%Y%m%d%H%M%S')
+
+ # create directory for where to dump model parameters, tensorboard logs, and videos
+ base_output_dir = os.path.expandvars(os.path.expanduser(config.train.output_dir))
+ if not os.path.isabs(base_output_dir):
+ # relative paths are specified relative to robomimic module location
+ base_output_dir = os.path.join(robomimic.__path__[0], base_output_dir)
+ base_output_dir = os.path.join(base_output_dir, config.experiment.name)
+ if os.path.exists(base_output_dir):
+ if not auto_remove_exp_dir:
+ ans = input("WARNING: model directory ({}) already exists! \noverwrite? (y/n)\n".format(base_output_dir))
+ else:
+ ans = "y"
+ if ans == "y":
+ print("REMOVING")
+ shutil.rmtree(base_output_dir)
+
+ # only make model directory if model saving is enabled
+ output_dir = None
+ if config.experiment.save.enabled:
+ output_dir = os.path.join(base_output_dir, time_str, "models")
+ os.makedirs(output_dir)
+
+ # tensorboard directory
+ log_dir = os.path.join(base_output_dir, time_str, "logs")
+ os.makedirs(log_dir)
+
+ # video directory
+ video_dir = os.path.join(base_output_dir, time_str, "videos")
+ os.makedirs(video_dir)
+
+ # establish sync path for syncing important training results back
+ set_absolute_sync_path(
+ output_dir=config.train.output_dir,
+ exp_name=config.experiment.name,
+ time_str=time_str,
+ )
+
+ return log_dir, output_dir, video_dir
+
+
+def set_absolute_sync_path(output_dir, exp_name, time_str=None):
+ """
+ Establish sync path for syncing important training results back and puts the path
+ into Macros.RESULTS_SYNC_PATH_ABS
+ """
+ need_sync_results = (Macros.USE_MAGLEV and (Macros.MAGLEV_SCRATCH_SYNC_PATH is not None)) or \
+ (Macros.USE_NGC and (Macros.NGC_SCRATCH_SYNC_PATH is not None)) or \
+ ((not Macros.USE_MAGLEV) and (not Macros.USE_NGC) and (Macros.RESULTS_SYNC_PATH is not None))
+ if need_sync_results:
+ # get path where we will sync results
+ assert Macros.RESULTS_SYNC_PATH_ABS is None
+ base_output_dir_name = os.path.basename(os.path.normpath(os.path.expandvars(os.path.expanduser(output_dir))))
+
+ if Macros.USE_MAGLEV:
+ # turn relative scratch space path into absolute scratch space path
+ sync_prefix = os.path.join(
+ os.getenv("WORKFLOW_SCRATCH"),
+ "test_disk", # NOTE: most workflows mount scratch space under this prefix
+ Macros.MAGLEV_SCRATCH_SYNC_PATH,
+ )
+ elif Macros.USE_NGC:
+ sync_prefix = os.path.expandvars(os.path.expanduser(Macros.NGC_SCRATCH_SYNC_PATH))
+ else:
+ sync_prefix = os.path.expandvars(os.path.expanduser(Macros.RESULTS_SYNC_PATH))
+
+ # store at results_sync_path/output_dir_name/experiment_name/time_str
+ sync_path_without_time_dir = os.path.join(
+ sync_prefix,
+ base_output_dir_name,
+ exp_name,
+ )
+ if os.path.exists(sync_path_without_time_dir):
+ # only keep one time directory per exp name
+ shutil.rmtree(sync_path_without_time_dir)
+ Macros.RESULTS_SYNC_PATH_ABS = sync_path_without_time_dir
+ if time_str is not None:
+ Macros.RESULTS_SYNC_PATH_ABS = os.path.join(sync_path_without_time_dir, time_str)
+ os.makedirs(Macros.RESULTS_SYNC_PATH_ABS)
+ elif (Macros.USE_MAGLEV or Macros.USE_NGC):
+ LogUtils.log_warning(
+            "Using MagLev / NGC, but MAGLEV_SCRATCH_SYNC_PATH / NGC_SCRATCH_SYNC_PATH is unset in macros.py. "
+ "No results will be synced back to scratch space."
+ )
+
+
+def load_data_for_training(config, obs_keys):
+ """
+ Data loading at the start of an algorithm.
+
+ Args:
+ config (BaseConfig instance): config object
+ obs_keys (list): list of observation modalities that are required for
+ training (this will inform the dataloader on what modalities to load)
+
+ Returns:
+ train_dataset (SequenceDataset instance): train dataset object
+ valid_dataset (SequenceDataset instance): valid dataset object (only if using validation)
+ """
+
+ # config can contain an attribute to filter on
+ train_filter_by_attribute = config.train.hdf5_filter_key
+ valid_filter_by_attribute = config.train.hdf5_validation_filter_key
+ if valid_filter_by_attribute is not None:
+ assert config.experiment.validate, "specified validation filter key {}, but config.experiment.validate is not set".format(valid_filter_by_attribute)
+
+ # load the dataset into memory
+ if config.experiment.validate:
+ assert not config.train.hdf5_normalize_obs, "no support for observation normalization with validation data yet"
+ assert (train_filter_by_attribute is not None) and (valid_filter_by_attribute is not None), \
+ "did not specify filter keys corresponding to train and valid split in dataset" \
+ " - please fill config.train.hdf5_filter_key and config.train.hdf5_validation_filter_key"
+ dataset_path = config.train.data if isinstance(config.train.data, str) else config.train.data[0]["path"]
+ train_demo_keys = FileUtils.get_demos_for_filter_key(
+ hdf5_path=os.path.expanduser(dataset_path),
+ filter_key=train_filter_by_attribute,
+ )
+ valid_demo_keys = FileUtils.get_demos_for_filter_key(
+ hdf5_path=os.path.expanduser(dataset_path),
+ filter_key=valid_filter_by_attribute,
+ )
+ assert set(train_demo_keys).isdisjoint(set(valid_demo_keys)), "training demonstrations overlap with " \
+ "validation demonstrations!"
+ train_dataset = dataset_factory(config, obs_keys, filter_by_attribute=train_filter_by_attribute)
+ valid_dataset = dataset_factory(config, obs_keys, filter_by_attribute=valid_filter_by_attribute)
+ else:
+ train_dataset = dataset_factory(config, obs_keys, filter_by_attribute=train_filter_by_attribute)
+ valid_dataset = None
+
+ return train_dataset, valid_dataset
+
+
+def dataset_factory(config, obs_keys, filter_by_attribute=None, dataset_path=None):
+ """
+ Create a SequenceDataset instance to pass to a torch DataLoader.
+
+ Args:
+ config (BaseConfig instance): config object
+
+ obs_keys (list): list of observation modalities that are required for
+ training (this will inform the dataloader on what modalities to load)
+
+ filter_by_attribute (str): if provided, use the provided filter key
+ to select a subset of demonstration trajectories to load
+
+ dataset_path (str): if provided, the SequenceDataset instance should load
+ data from this dataset path. Defaults to config.train.data.
+
+ Returns:
+ dataset (SequenceDataset instance): dataset object
+ """
+ if dataset_path is None:
+ dataset_path = config.train.data
+
+ ds_kwargs = dict(
+ # hdf5_path=dataset_path,
+ obs_keys=obs_keys,
+ action_keys=config.train.action_keys,
+ dataset_keys=config.train.dataset_keys,
+ action_config=config.train.action_config,
+ load_next_obs=config.train.hdf5_load_next_obs, # whether to load next observations (s') from dataset
+ frame_stack=config.train.frame_stack,
+ seq_length=config.train.seq_length,
+ pad_frame_stack=config.train.pad_frame_stack,
+ pad_seq_length=config.train.pad_seq_length,
+ get_pad_mask=False,
+ goal_mode=config.train.goal_mode,
+ hdf5_cache_mode=config.train.hdf5_cache_mode,
+ hdf5_use_swmr=config.train.hdf5_use_swmr,
+ hdf5_normalize_obs=config.train.hdf5_normalize_obs,
+ # filter_by_attribute=filter_by_attribute
+ )
+
+ if isinstance(dataset_path, str):
+ ds_kwargs["hdf5_path"] = [dataset_path]
+ ds_kwargs["filter_by_attribute"] = [filter_by_attribute]
+ ds_weights = [1.0]
+ ds_labels = ["dummy"]
+ else:
+ ds_kwargs["hdf5_path"] = [ds_cfg["path"] for ds_cfg in config.train.data]
+ ds_kwargs["filter_by_attribute"] = [filter_by_attribute for ds_cfg in config.train.data]
+ ds_weights = [ds_cfg.get("weight", 1.0) for ds_cfg in config.train.data]
+ ds_labels = [ds_cfg.get("label", "dummy") for ds_cfg in config.train.data]
+
+ meta_ds_kwargs = dict()
+
+ dataset = get_dataset(
+ ds_class=R2D2Dataset if config.train.data_format == "r2d2" else SequenceDataset,
+ ds_kwargs=ds_kwargs,
+ ds_weights=ds_weights,
+ ds_labels=ds_labels,
+ normalize_weights_by_ds_size=False,
+ meta_ds_class=MetaDataset,
+ meta_ds_kwargs=meta_ds_kwargs,
+ )
+
+ return dataset
+
+
+def get_dataset(
+ ds_class,
+ ds_kwargs,
+ ds_weights,
+ ds_labels,
+ normalize_weights_by_ds_size,
+ meta_ds_class=MetaDataset,
+ meta_ds_kwargs=None,
+):
+ ds_list = []
+ for i in range(len(ds_weights)):
+
+ ds_kwargs_copy = deepcopy(ds_kwargs)
+
+ keys = ["hdf5_path", "filter_by_attribute"]
+
+ for k in keys:
+ ds_kwargs_copy[k] = ds_kwargs[k][i]
+
+ ds_list.append(ds_class(**ds_kwargs_copy))
+
+ if len(ds_weights) == 1:
+ ds = ds_list[0]
+ else:
+ if meta_ds_kwargs is None:
+ meta_ds_kwargs = dict()
+ ds = meta_ds_class(
+ datasets=ds_list,
+ ds_weights=ds_weights,
+ ds_labels=ds_labels,
+ normalize_weights_by_ds_size=normalize_weights_by_ds_size,
+ **meta_ds_kwargs
+ )
+
+ return ds
+
+
+def run_rollout(
+ policy,
+ env,
+ horizon,
+ use_goals=False,
+ render=False,
+ video_writer=None,
+ video_skip=5,
+ terminate_on_success=False,
+ ):
+ """
+ Runs a rollout in an environment with the current network parameters.
+
+ Args:
+ policy (RolloutPolicy instance): policy to use for rollouts.
+
+ env (EnvBase instance): environment to use for rollouts.
+
+ horizon (int): maximum number of steps to roll the agent out for
+
+ use_goals (bool): if True, agent is goal-conditioned, so provide goal observations from env
+
+ render (bool): if True, render the rollout to the screen
+
+ video_writer (imageio Writer instance): if not None, use video writer object to append frames at
+ rate given by @video_skip
+
+ video_skip (int): how often to write video frame
+
+ terminate_on_success (bool): if True, terminate episode early as soon as a success is encountered
+
+ Returns:
+ results (dict): dictionary containing return, success rate, etc.
+ """
+ assert isinstance(policy, RolloutPolicy)
+ assert isinstance(env, EnvBase) or isinstance(env, EnvWrapper)
+
+ policy.start_episode()
+
+ ob_dict = env.reset()
+ goal_dict = None
+ if use_goals:
+ # retrieve goal from the environment
+ goal_dict = env.get_goal()
+
+ results = {}
+ video_count = 0 # video frame counter
+
+ total_reward = 0.
+ success = { k: False for k in env.is_success() } # success metrics
+ got_exception = False
+
+ try:
+ for step_i in range(horizon):
+
+ # get action from policy
+ ac = policy(ob=ob_dict, goal=goal_dict)
+
+ # play action
+ ob_dict, r, done, _ = env.step(ac)
+
+ # render to screen
+ if render:
+ env.render(mode="human")
+
+ # compute reward
+ total_reward += r
+
+ cur_success_metrics = env.is_success()
+ for k in success:
+ success[k] = success[k] or cur_success_metrics[k]
+
+ # visualization
+ if video_writer is not None:
+ if video_count % video_skip == 0:
+ video_img = env.render(mode="rgb_array", height=512, width=512)
+ video_writer.append_data(video_img)
+
+ video_count += 1
+
+ # break if done
+ if done or (terminate_on_success and success["task"]):
+ break
+
+ except env.rollout_exceptions as e:
+ print("WARNING: got rollout exception {}".format(e))
+ got_exception = True
+
+ results["Return"] = total_reward
+ results["Horizon"] = step_i + 1
+ results["Success_Rate"] = float(success["task"])
+ results["Exception_Rate"] = float(got_exception)
+
+ # log additional success metrics
+ for k in success:
+ if k != "task":
+ results["{}_Success_Rate".format(k)] = float(success[k])
+
+ return results
+
+
+def rollout_with_stats(
+ policy,
+ envs,
+ horizon,
+ use_goals=False,
+ num_episodes=None,
+ render=False,
+ video_dir=None,
+ video_path=None,
+ epoch=None,
+ video_skip=5,
+ terminate_on_success=False,
+ verbose=False,
+ ):
+ """
+ A helper function used in the train loop to conduct evaluation rollouts per environment
+ and summarize the results.
+
+ Can specify @video_dir (to dump a video per environment) or @video_path (to dump a single video
+ for all environments).
+
+ Args:
+ policy (RolloutPolicy instance): policy to use for rollouts.
+
+ envs (dict): dictionary that maps env_name (str) to EnvBase instance. The policy will
+ be rolled out in each env.
+
+ horizon (int): maximum number of steps to roll the agent out for
+
+ use_goals (bool): if True, agent is goal-conditioned, so provide goal observations from env
+
+ num_episodes (int): number of rollout episodes per environment
+
+ render (bool): if True, render the rollout to the screen
+
+ video_dir (str): if not None, dump rollout videos to this directory (one per environment)
+
+ video_path (str): if not None, dump a single rollout video for all environments
+
+ epoch (int): epoch number (used for video naming)
+
+ video_skip (int): how often to write video frame
+
+ terminate_on_success (bool): if True, terminate episode early as soon as a success is encountered
+
+ verbose (bool): if True, print results of each rollout
+
+ Returns:
+ all_rollout_logs (dict): dictionary of rollout statistics (e.g. return, success rate, ...)
+ averaged across all rollouts
+
+ video_paths (dict): path to rollout videos for each environment
+ """
+ assert isinstance(policy, RolloutPolicy)
+
+ all_rollout_logs = OrderedDict()
+
+ # handle paths and create writers for video writing
+ assert (video_path is None) or (video_dir is None), "rollout_with_stats: can't specify both video path and dir"
+ write_video = (video_path is not None) or (video_dir is not None)
+ video_paths = OrderedDict()
+ video_writers = OrderedDict()
+ if video_path is not None:
+ # a single video is written for all envs
+ video_paths = { k : video_path for k in envs }
+ video_writer = imageio.get_writer(video_path, fps=20)
+ video_writers = { k : video_writer for k in envs }
+ if video_dir is not None:
+ # video is written per env
+ video_str = "_epoch_{}.mp4".format(epoch) if epoch is not None else ".mp4"
+ video_paths = { k : os.path.join(video_dir, "{}{}".format(k, video_str)) for k in envs }
+ video_writers = { k : imageio.get_writer(video_paths[k], fps=20) for k in envs }
+
+ for env_name, env in envs.items():
+ env_video_writer = None
+ if write_video:
+ print("video writes to " + video_paths[env_name])
+ env_video_writer = video_writers[env_name]
+
+ print("rollout: env={}, horizon={}, use_goals={}, num_episodes={}".format(
+ env.name, horizon, use_goals, num_episodes,
+ ))
+ rollout_logs = []
+ iterator = range(num_episodes)
+ if not verbose:
+ iterator = LogUtils.custom_tqdm(iterator, total=num_episodes)
+
+ num_success = 0
+ for ep_i in iterator:
+ rollout_timestamp = time.time()
+ rollout_info = run_rollout(
+ policy=policy,
+ env=env,
+ horizon=horizon,
+ render=render,
+ use_goals=use_goals,
+ video_writer=env_video_writer,
+ video_skip=video_skip,
+ terminate_on_success=terminate_on_success,
+ )
+ rollout_info["time"] = time.time() - rollout_timestamp
+ rollout_logs.append(rollout_info)
+ num_success += rollout_info["Success_Rate"]
+ if verbose:
+ print("Episode {}, horizon={}, num_success={}".format(ep_i + 1, horizon, num_success))
+ print(json.dumps(rollout_info, sort_keys=True, indent=4))
+
+ if video_dir is not None:
+            # close this env's video writer (the next env has its own)
+ env_video_writer.close()
+
+ # average metric across all episodes
+ rollout_logs = dict((k, [rollout_logs[i][k] for i in range(len(rollout_logs))]) for k in rollout_logs[0])
+ rollout_logs_mean = dict((k, np.mean(v)) for k, v in rollout_logs.items())
+ rollout_logs_mean["Time_Episode"] = np.sum(rollout_logs["time"]) / 60. # total time taken for rollouts in minutes
+ all_rollout_logs[env_name] = rollout_logs_mean
+
+ if video_path is not None:
+ # close video writer that was used for all envs
+ video_writer.close()
+
+ return all_rollout_logs, video_paths
+
+
+def should_save_from_rollout_logs(
+ all_rollout_logs,
+ best_return,
+ best_success_rate,
+ epoch_ckpt_name,
+ save_on_best_rollout_return,
+ save_on_best_rollout_success_rate,
+ ):
+ """
+ Helper function used during training to determine whether checkpoints and videos
+ should be saved. It will modify input attributes appropriately (such as updating
+ the best returns and success rates seen and modifying the epoch ckpt name), and
+ returns a dict with the updated statistics.
+
+ Args:
+ all_rollout_logs (dict): dictionary of rollout results that should be consistent
+ with the output of @rollout_with_stats
+
+ best_return (dict): dictionary that stores the best average rollout return seen so far
+ during training, for each environment
+
+ best_success_rate (dict): dictionary that stores the best average success rate seen so far
+ during training, for each environment
+
+ epoch_ckpt_name (str): what to name the checkpoint file - this name might be modified
+ by this function
+
+ save_on_best_rollout_return (bool): if True, should save checkpoints that achieve a
+ new best rollout return
+
+ save_on_best_rollout_success_rate (bool): if True, should save checkpoints that achieve a
+ new best rollout success rate
+
+ Returns:
+ save_info (dict): dictionary that contains updated input attributes @best_return,
+ @best_success_rate, @epoch_ckpt_name, along with two additional attributes
+ @should_save_ckpt (True if should save this checkpoint), and @ckpt_reason
+ (string that contains the reason for saving the checkpoint)
+ """
+ should_save_ckpt = False
+ ckpt_reason = None
+ for env_name in all_rollout_logs:
+ rollout_logs = all_rollout_logs[env_name]
+
+ if rollout_logs["Return"] > best_return[env_name]:
+ best_return[env_name] = rollout_logs["Return"]
+ if save_on_best_rollout_return:
+ # save checkpoint if achieve new best return
+ epoch_ckpt_name += "_{}_return_{}".format(env_name, best_return[env_name])
+ should_save_ckpt = True
+ ckpt_reason = "return"
+
+ if rollout_logs["Success_Rate"] > best_success_rate[env_name]:
+ best_success_rate[env_name] = rollout_logs["Success_Rate"]
+ if save_on_best_rollout_success_rate:
+ # save checkpoint if achieve new best success rate
+ epoch_ckpt_name += "_{}_success_{}".format(env_name, best_success_rate[env_name])
+ should_save_ckpt = True
+ ckpt_reason = "success"
+
+ # return the modified input attributes
+ return dict(
+ best_return=best_return,
+ best_success_rate=best_success_rate,
+ epoch_ckpt_name=epoch_ckpt_name,
+ should_save_ckpt=should_save_ckpt,
+ ckpt_reason=ckpt_reason,
+ )
+
+
+def save_model(model, config, env_meta, shape_meta, ckpt_path, obs_normalization_stats=None, action_normalization_stats=None):
+ """
+ Save model to a torch pth file.
+
+ Args:
+ model (Algo instance): model to save
+
+ config (BaseConfig instance): config to save
+
+ env_meta (dict): env metadata for this training run
+
+        shape_meta (dict): shape metadata for this training run
+
+ ckpt_path (str): writes model checkpoint to this path
+
+ obs_normalization_stats (dict): optionally pass a dictionary for observation
+ normalization. This should map observation keys to dicts
+ with a "mean" and "std" of shape (1, ...) where ... is the default
+ shape for the observation.
+
+ action_normalization_stats (dict): TODO
+ """
+ env_meta = deepcopy(env_meta)
+ shape_meta = deepcopy(shape_meta)
+ params = dict(
+ model=model.serialize(),
+ config=config.dump(),
+ algo_name=config.algo_name,
+ env_metadata=env_meta,
+ shape_metadata=shape_meta,
+ )
+ if obs_normalization_stats is not None:
+ assert config.train.hdf5_normalize_obs
+ obs_normalization_stats = deepcopy(obs_normalization_stats)
+ params["obs_normalization_stats"] = TensorUtils.to_list(obs_normalization_stats)
+ if action_normalization_stats is not None:
+ action_normalization_stats = deepcopy(action_normalization_stats)
+ params["action_normalization_stats"] = TensorUtils.to_list(action_normalization_stats)
+ torch.save(params, ckpt_path)
+ print("save checkpoint to {}".format(ckpt_path))
+
+
+def run_epoch(model, data_loader, epoch, validate=False, num_steps=None, obs_normalization_stats=None):
+ """
+ Run an epoch of training or validation.
+
+ Args:
+ model (Algo instance): model to train
+
+ data_loader (DataLoader instance): data loader that will be used to serve batches of data
+ to the model
+
+ epoch (int): epoch number
+
+ validate (bool): whether this is a training epoch or validation epoch. This tells the model
+ whether to do gradient steps or purely do forward passes.
+
+ num_steps (int): if provided, this epoch lasts for a fixed number of batches (gradient steps),
+ otherwise the epoch is a complete pass through the training dataset
+
+ obs_normalization_stats (dict or None): if provided, this should map observation keys to dicts
+ with a "mean" and "std" of shape (1, ...) where ... is the default
+ shape for the observation.
+
+ Returns:
+ step_log_all (dict): dictionary of logged training metrics averaged across all batches
+ """
+ epoch_timestamp = time.time()
+ if validate:
+ model.set_eval()
+ else:
+ model.set_train()
+ if num_steps is None:
+ num_steps = len(data_loader)
+
+ step_log_all = []
+ timing_stats = dict(Data_Loading=[], Process_Batch=[], Train_Batch=[], Log_Info=[])
+ start_time = time.time()
+
+ data_loader_iter = iter(data_loader)
+ for _ in LogUtils.custom_tqdm(range(num_steps)):
+
+ # load next batch from data loader
+ try:
+ t = time.time()
+ batch = next(data_loader_iter)
+ except StopIteration:
+ # reset for next dataset pass
+ data_loader_iter = iter(data_loader)
+ t = time.time()
+ batch = next(data_loader_iter)
+ timing_stats["Data_Loading"].append(time.time() - t)
+
+ # process batch for training
+ t = time.time()
+ input_batch = model.process_batch_for_training(batch)
+ input_batch = model.postprocess_batch_for_training(input_batch, obs_normalization_stats=obs_normalization_stats)
+ timing_stats["Process_Batch"].append(time.time() - t)
+
+ # forward and backward pass
+ t = time.time()
+ info = model.train_on_batch(input_batch, epoch, validate=validate)
+ timing_stats["Train_Batch"].append(time.time() - t)
+
+ # tensorboard logging
+ t = time.time()
+ step_log = model.log_info(info)
+ step_log_all.append(step_log)
+ timing_stats["Log_Info"].append(time.time() - t)
+
+ # flatten and take the mean of the metrics
+ step_log_dict = {}
+ for i in range(len(step_log_all)):
+ for k in step_log_all[i]:
+ if k not in step_log_dict:
+ step_log_dict[k] = []
+ step_log_dict[k].append(step_log_all[i][k])
+ step_log_all = dict((k, float(np.mean(v))) for k, v in step_log_dict.items())
+
+ # add in timing stats
+ for k in timing_stats:
+ # sum across all training steps, and convert from seconds to minutes
+ step_log_all["Time_{}".format(k)] = np.sum(timing_stats[k]) / 60.
+ step_log_all["Time_Epoch"] = (time.time() - epoch_timestamp) / 60.
+
+ return step_log_all
+
+
+def is_every_n_steps(interval, current_step, skip_zero=False):
+ """
+    Convenient function to check whether current_step falls on the given interval.
+    Returns True if current_step % interval == 0, and asserts on invalid arguments (e.g. interval <= 0).
+
+ Args:
+ interval (int): target interval
+ current_step (int): current step
+ skip_zero (bool): whether to skip 0 (return False at 0)
+
+ Returns:
+ is_at_interval (bool): whether current_step is at the interval
+ """
+ if interval is None:
+ return False
+ assert isinstance(interval, int) and interval > 0
+ assert isinstance(current_step, int) and current_step >= 0
+ if skip_zero and current_step == 0:
+ return False
+ return current_step % interval == 0
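+
+# Example (illustrative; `epoch` is whatever step counter the training loop maintains):
+# save a checkpoint every 50 epochs, skipping epoch 0.
+# >>> if is_every_n_steps(50, epoch, skip_zero=True):
+# ...     save_model(...)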
+
+
+def get_model_from_output_folder(models_path, videos_path=None, epoch=None, best=False, last=False):
+ """
+ Gets path to model (and video) for a certain epoch number (or the best or last epoch).
+
+ Args:
+ models_path (str): path to models folder (in output directory)
+ videos_path (str): path to videos folder (in output directory)
+ epoch (int): if provided, get model ckpt and video for this epoch
+ best (bool): if True, get the model and video for the best checkpoint (according to success rate)
+ last (bool): if True, get the model and video for the last checkpoint (according to epoch number)
+
+ Returns:
+ model_path (str): path to model pth
+ video_path (str): path to mp4
+ epoch (int): epoch number for retrieved model and video paths
+ """
+
+ # make sure we either grab a specific epoch, best epoch, or last epoch
+ assert sum([(epoch is not None), best, last]) == 1
+
+ # run through models to find the epoch we want
+ best_success_rate = -0.1
+ need_particular_epoch = (epoch is not None)
+ need_best_epoch = best
+ need_max_epoch = last
+
+ selected_epoch = -1
+ selected_model_path = None
+ for f in os.scandir(models_path):
+ model_epoch = int(f.name.split("_")[2].strip(".pth"))
+
+ if need_particular_epoch and (model_epoch == epoch):
+ selected_epoch = epoch
+ selected_model_path = os.path.join(models_path, f.name)
+
+ elif need_best_epoch:
+ # this block assumes that the experiment run opted to save the model with the best checkpoint
+ if "success" in f.name:
+ # example name: model_epoch_250_NutAssemblySquareTarget_6_success_0.86.pth
+ # take last piece - "0.86.pth" -> "0.86" -> convert to float
+ success_rate = float(f.name.split("success_")[-1][:-4])
+ if success_rate > best_success_rate:
+ best_success_rate = success_rate
+ selected_epoch = model_epoch
+ selected_model_path = os.path.join(models_path, f.name)
+
+ elif need_max_epoch:
+ # find last epoch
+ if model_epoch > selected_epoch:
+ selected_epoch = model_epoch
+ selected_model_path = os.path.join(models_path, f.name)
+
+ assert selected_epoch != -1
+ assert selected_model_path is not None
+
+ selected_video_path = None
+ if videos_path is not None:
+ # get random video filename
+ video_fname = None
+ for f in os.scandir(videos_path):
+ video_fname = f.name
+ break
+ # example video file name: NutAssemblySquareTarget_6_epoch_150.mp4
+ # take name skeleton and use it to infer name of source videos we want, then copy them
+ video_name_prefix = video_fname.split("epoch")[0]
+ selected_video_path = os.path.join(videos_path, "{}epoch_{}.mp4".format(video_name_prefix, selected_epoch))
+ return selected_model_path, selected_video_path, selected_epoch
diff --git a/phantom/submodules/phantom-robomimic/robomimic/utils/vis_utils.py b/phantom/submodules/phantom-robomimic/robomimic/utils/vis_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..19c73d7a1ee504db2a73e62c28e6aaf11a0f714b
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/robomimic/utils/vis_utils.py
@@ -0,0 +1,111 @@
+"""
+This file contains utility functions for visualizing image observations in the training pipeline.
+These functions can be a useful debugging tool.
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.cm as cm
+from PIL import Image
+
+import robomimic.utils.tensor_utils as TensorUtils
+import robomimic.utils.obs_utils as ObsUtils
+
+
+def image_tensor_to_numpy(image):
+ """
+ Converts processed image tensors to numpy so that they can be saved to disk or video.
+ A useful utility function for visualizing images in the middle of training.
+
+ Args:
+ image (torch.Tensor): images of shape [..., C, H, W]
+
+ Returns:
+ image (np.array): converted images of shape [..., H, W, C] and type uint8
+ """
+ return TensorUtils.to_numpy(
+ ObsUtils.unprocess_image(image)
+ ).astype(np.uint8)
+
+
+def image_to_disk(image, fname):
+ """
+ Writes an image to disk.
+
+ Args:
+ image (np.array): image of shape [H, W, 3]
+ fname (str): path to save image to
+ """
+ image = Image.fromarray(image)
+ image.save(fname)
+
+
+def image_tensor_to_disk(image, fname):
+ """
+ Writes an image tensor to disk. Any leading batch dimensions are indexed out
+ with the first element.
+
+ Args:
+ image (torch.Tensor): image of shape [..., C, H, W]. All leading dimensions
+ will be indexed out with the first element
+ fname (str): path to save image to
+ """
+ # index out all leading dimensions before [C, H, W]
+ num_leading_dims = len(image.shape[:-3])
+ for _ in range(num_leading_dims):
+ image = image[0]
+ image = image_tensor_to_numpy(image)
+ image_to_disk(image, fname)
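+
+# Minimal usage sketch (hypothetical tensor shape): a batched sequence [B, T, C, H, W]
+# is indexed down to its first [C, H, W] frame before being written, e.g.
+#   image_tensor_to_disk(torch.rand(4, 10, 3, 84, 84), "/tmp/frame.png")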
+
+
+def visualize_image_randomizer(original_image, randomized_image, randomizer_name=None):
+ """
+ A function that visualizes the before and after of an image-based input randomizer
+ Args:
+ original_image: batch of original image shaped [B, H, W, 3]
+ randomized_image: randomized image shaped [B, N, H, W, 3]. N is the number of randomization per input sample
+ randomizer_name: (Optional) name of the randomizer
+ Returns:
+ None
+ """
+
+ B, N, H, W, C = randomized_image.shape
+
+ # Create a grid of subplots with B rows and N+1 columns (1 for the original image, N for the randomized images)
+ fig, axes = plt.subplots(B, N + 1, figsize=(4 * (N + 1), 4 * B))
+
+ for i in range(B):
+ # Display the original image in the first column of each row
+ axes[i, 0].imshow(original_image[i])
+ axes[i, 0].set_title("Original")
+ axes[i, 0].axis("off")
+
+ # Display the randomized images in the remaining columns of each row
+ for j in range(N):
+ axes[i, j + 1].imshow(randomized_image[i, j])
+ axes[i, j + 1].axis("off")
+
+ title = randomizer_name if randomizer_name is not None else "Randomized"
+ fig.suptitle(title, fontsize=16)
+
+ # Adjust the space between subplots for better visualization
+ plt.subplots_adjust(wspace=0.5, hspace=0.5)
+
+ # Show the entire grid of subplots
+ plt.show()
+
+
+def depth_to_rgb(depth_map, depth_min=None, depth_max=None):
+ """
+ Convert depth map to rgb array by computing normalized depth values in [0, 1].
+ """
+ # normalize depth map into [0, 1]
+ if depth_min is None:
+ depth_min = depth_map.min()
+ if depth_max is None:
+ depth_max = depth_map.max()
+ depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+ # depth_map = np.clip(depth_map / 3., 0., 1.)
+ if len(depth_map.shape) == 3:
+ assert depth_map.shape[-1] == 1
+ depth_map = depth_map[..., 0]
+ assert len(depth_map.shape) == 2 # [H, W]
+ return (255. * cm.hot(depth_map, 3)).astype(np.uint8)[..., :3]
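+
+# Minimal usage sketch (hypothetical depth values):
+#   depth = np.random.uniform(0.5, 2.0, size=(84, 84, 1))
+#   rgb = depth_to_rgb(depth)                # -> uint8 array of shape [84, 84, 3]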
diff --git a/phantom/submodules/phantom-robomimic/setup.py b/phantom/submodules/phantom-robomimic/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..91eca6ca948f03ee52383e697ce14b00b39856e9
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/setup.py
@@ -0,0 +1,44 @@
+from setuptools import setup, find_packages
+
+# read the contents of your README file
+from os import path
+this_directory = path.abspath(path.dirname(__file__))
+with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
+ lines = f.readlines()
+
+# remove images from README
+lines = [x for x in lines if (('.png' not in x) and ('.gif' not in x))]
+long_description = ''.join(lines)
+
+setup(
+ name="robomimic",
+ packages=[
+ package for package in find_packages() if package.startswith("robomimic")
+ ],
+ install_requires=[
+ "numpy>=1.13.3",
+ "h5py",
+ "psutil",
+ "tqdm",
+ "termcolor",
+ "tensorboard",
+ "tensorboardX",
+ "imageio",
+ "imageio-ffmpeg",
+ "matplotlib",
+ "egl_probe>=1.0.1",
+ # "torch",
+ # "torchvision",
+ "diffusers>=0.26.2",
+ ],
+ eager_resources=['*'],
+ include_package_data=True,
+ python_requires='>=3',
+ description="robomimic: A Modular Framework for Robot Learning from Demonstration",
+ author="Ajay Mandlekar, Danfei Xu, Josiah Wong, Soroush Nasiriany, Chen Wang, Matthew Bronars",
+ url="https://github.com/ARISE-Initiative/robomimic",
+ author_email="amandlek@cs.stanford.edu",
+ version="0.3.0",
+ long_description=long_description,
+ long_description_content_type='text/markdown'
+)
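+
+# Typical usage sketch: install this submodule in editable mode from this directory, e.g.
+#   pip install -e .
+# so that local changes are picked up without reinstalling.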
diff --git a/phantom/submodules/phantom-robomimic/tests/test.sh b/phantom/submodules/phantom-robomimic/tests/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e25e77c134753f486bfa3231fc3abe7bef11754c
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+echo "running tests for bc..."
+python test_bc.py
+echo "running tests for hbc..."
+python test_hbc.py
+echo "running tests for iris..."
+python test_iris.py
+echo "running tests for bcq..."
+python test_bcq.py
+echo "running tests for cql..."
+python test_cql.py
+echo "running tests for scripts..."
+python test_scripts.py
+echo "running tests for examples..."
+python test_examples.py
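+
+# To debug a single suite with stdout enabled, each script also accepts --verbose, e.g.:
+#   python test_bc.py --verbose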
diff --git a/phantom/submodules/phantom-robomimic/tests/test_bc.py b/phantom/submodules/phantom-robomimic/tests/test_bc.py
new file mode 100644
index 0000000000000000000000000000000000000000..adc125014bafb920aa45d69fc7d48798d3231c6f
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test_bc.py
@@ -0,0 +1,295 @@
+"""
+Test script for BC algorithms. Each test trains a variant of BC
+for a handful of gradient steps and tries one rollout with
+the model. Excludes stdout output by default (pass --verbose
+to see stdout output).
+"""
+import argparse
+from collections import OrderedDict
+
+import robomimic
+from robomimic.config import Config
+import robomimic.utils.test_utils as TestUtils
+from robomimic.utils.log_utils import silence_stdout
+from robomimic.utils.torch_utils import dummy_context_mgr
+
+
+def get_algo_base_config():
+ """
+ Base config for testing BC algorithms.
+ """
+
+ # config with basic settings for quick training run
+ config = TestUtils.get_base_config(algo_name="bc")
+
+ # low-level obs (note that we define it here because @observation structure might vary per algorithm,
+ # for example HBC)
+ config.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.modalities.obs.rgb = []
+
+ # by default, vanilla BC
+ config.algo.gaussian.enabled = False
+ config.algo.gmm.enabled = False
+ config.algo.vae.enabled = False
+ config.algo.rnn.enabled = False
+
+ return config
+
+
+def convert_config_for_images(config):
+ """
+ Modify config to use image observations.
+ """
+
+ # using high-dimensional images - don't load entire dataset into memory, and smaller batch size
+ config.train.hdf5_cache_mode = "low_dim"
+ config.train.num_data_workers = 0
+ config.train.batch_size = 16
+
+ # replace object with rgb modality
+ config.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos"]
+ config.observation.modalities.obs.rgb = ["agentview_image"]
+
+ # set up visual encoders
+ config.observation.encoder.rgb.core_class = "VisualCore"
+ config.observation.encoder.rgb.core_kwargs.feature_dimension = 64
+ config.observation.encoder.rgb.core_kwargs.backbone_class = 'ResNet18Conv' # ResNet backbone for image observations (unused if no image observations)
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.pretrained = False # kwargs for visual core
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.input_coord_conv = False
+ config.observation.encoder.rgb.core_kwargs.pool_class = "SpatialSoftmax" # Alternate options are "SpatialMeanPool" or None (no pooling)
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.num_kp = 32 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.learnable_temperature = False # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.temperature = 1.0 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.noise_std = 0.0
+
+ # observation randomizer class - set to None to use no randomization, or 'CropRandomizer' to use crop randomization
+ config.observation.encoder.rgb.obs_randomizer_class = None
+
+ return config
+
+
+def make_image_modifier(config_modifier):
+ """
+ Turn a config modifier into its image version. Note that
+ this explicit function definition is needed for proper
+ scoping of @config_modifier.
+ """
+ return lambda x: config_modifier(convert_config_for_images(x))
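+
+# In other words (illustrative): for any modifier f registered below,
+#   make_image_modifier(f)(config) == f(convert_config_for_images(config))
+# i.e. the config is first switched to RGB observations and then modified as usual.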
+
+
+# mapping from test name to config modifier functions
+MODIFIERS = OrderedDict()
+def register_mod(test_name):
+    def decorator(config_modifier):
+        MODIFIERS[test_name] = config_modifier
+        return config_modifier
+    return decorator
+
+
+@register_mod("bc")
+def bc_modifier(config):
+ # no-op
+ return config
+
+
+@register_mod("bc-gaussian")
+def bc_gaussian_modifier(config):
+ config.algo.gaussian.enabled = True
+ return config
+
+
+@register_mod("bc-gmm")
+def bc_gmm_modifier(config):
+ config.algo.gmm.enabled = True
+ return config
+
+
+@register_mod("bc-vae, N(0, 1) prior")
+def bc_vae_modifier_1(config):
+ # N(0, 1) prior
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = False
+ config.algo.vae.prior.is_conditioned = False
+ return config
+
+
+@register_mod("bc-vae, Gaussian prior (obs-independent)")
+def bc_vae_modifier_2(config):
+ # learn parameters of Gaussian prior (obs-independent)
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = True
+ config.algo.vae.prior.is_conditioned = False
+ config.algo.vae.prior.use_gmm = False
+ config.algo.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bc-vae, Gaussian prior (obs-dependent)")
+def bc_vae_modifier_3(config):
+ # learn parameters of Gaussian prior (obs-dependent)
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = True
+ config.algo.vae.prior.is_conditioned = True
+ config.algo.vae.prior.use_gmm = False
+ config.algo.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bc-vae, GMM prior (obs-independent, weights-fixed)")
+def bc_vae_modifier_4(config):
+ # learn parameters of GMM prior (obs-independent, weights-fixed)
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = True
+ config.algo.vae.prior.is_conditioned = False
+ config.algo.vae.prior.use_gmm = True
+ config.algo.vae.prior.gmm_learn_weights = False
+ config.algo.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bc-vae, GMM prior (obs-independent, weights-learned)")
+def bc_vae_modifier_5(config):
+ # learn parameters of GMM prior (obs-independent, weights-learned)
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = True
+ config.algo.vae.prior.is_conditioned = False
+ config.algo.vae.prior.use_gmm = True
+ config.algo.vae.prior.gmm_learn_weights = True
+ config.algo.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bc-vae, GMM prior (obs-dependent, weights-fixed)")
+def bc_vae_modifier_6(config):
+ # learn parameters of GMM prior (obs-dependent, weights-fixed)
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = True
+ config.algo.vae.prior.is_conditioned = True
+ config.algo.vae.prior.use_gmm = True
+ config.algo.vae.prior.gmm_learn_weights = False
+ config.algo.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bc-vae, GMM prior (obs-dependent, weights-learned)")
+def bc_vae_modifier_7(config):
+ # learn parameters of GMM prior (obs-dependent, weights-learned)
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = True
+ config.algo.vae.prior.is_conditioned = True
+ config.algo.vae.prior.use_gmm = True
+ config.algo.vae.prior.gmm_learn_weights = True
+ config.algo.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bc-vae, uniform categorical prior")
+def bc_vae_modifier_8(config):
+ # uniform categorical prior
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = False
+ config.algo.vae.prior.is_conditioned = False
+ config.algo.vae.prior.use_gmm = False
+ config.algo.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("bc-vae, categorical prior (obs-independent)")
+def bc_vae_modifier_9(config):
+ # learn parameters of categorical prior (obs-independent)
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = True
+ config.algo.vae.prior.is_conditioned = False
+ config.algo.vae.prior.use_gmm = False
+ config.algo.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("bc-vae, categorical prior (obs-dependent)")
+def bc_vae_modifier_10(config):
+ # learn parameters of categorical prior (obs-dependent)
+ config.algo.vae.enabled = True
+ config.algo.vae.prior.learn = True
+ config.algo.vae.prior.is_conditioned = True
+ config.algo.vae.prior.use_gmm = False
+ config.algo.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("bc-rnn")
+def bc_rnn_modifier(config):
+ config.algo.rnn.enabled = True
+ config.algo.rnn.horizon = 10
+ config.train.seq_length = 10
+ return config
+
+
+@register_mod("bc-rnn-gmm")
+def bc_rnn_gmm_modifier(config):
+ config.algo.gmm.enabled = True
+ config.algo.rnn.enabled = True
+ config.algo.rnn.horizon = 10
+ config.train.seq_length = 10
+ return config
+
+
+@register_mod("bc-transformer")
+def bc_transformer_modifier(config):
+ config.algo.transformer.enabled = True
+ config.train.frame_stack = 10
+ config.train.seq_length = 1
+ return config
+
+
+@register_mod("bc-transformer-gmm")
+def bc_transformer_gmm_modifier(config):
+ config.algo.gmm.enabled = True
+ config.algo.transformer.enabled = True
+ config.train.frame_stack = 10
+ config.train.seq_length = 1
+ return config
+
+
+# add image version of all tests
+image_modifiers = OrderedDict()
+for test_name in MODIFIERS:
+ lst = test_name.split("-")
+ name = "-".join(lst[:1] + ["rgb"] + lst[1:])
+ image_modifiers[name] = make_image_modifier(MODIFIERS[test_name])
+MODIFIERS.update(image_modifiers)
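+
+# For example, the low-dim test "bc-gmm" above gains an image counterpart registered
+# as "bc-rgb-gmm" ("rgb" is spliced in after the leading "bc" token).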
+
+
+# test for image crop randomization
+@register_mod("bc-image-crop")
+def bc_image_crop_modifier(config):
+ config = convert_config_for_images(config)
+
+ # observation randomizer class - using Crop randomizer
+ config.observation.encoder.rgb.obs_randomizer_class = "CropRandomizer"
+
+ # kwargs for observation randomizers (for the CropRandomizer, this is size and number of crops)
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_height = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_width = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.num_crops = 1
+ config.observation.encoder.rgb.obs_randomizer_kwargs.pos_enc = False
+ return config
+
+
+def test_bc(silence=True):
+ for test_name in MODIFIERS:
+ context = silence_stdout() if silence else dummy_context_mgr()
+ with context:
+ base_config = get_algo_base_config()
+ res_str = TestUtils.test_run(base_config=base_config, config_modifier=MODIFIERS[test_name])
+ print("{}: {}".format(test_name, res_str))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="don't suppress stdout during tests",
+ )
+ args = parser.parse_args()
+
+ test_bc(silence=(not args.verbose))
diff --git a/phantom/submodules/phantom-robomimic/tests/test_bcq.py b/phantom/submodules/phantom-robomimic/tests/test_bcq.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8bd08356575e66cb779afa2aebf71b560262cd4
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test_bcq.py
@@ -0,0 +1,263 @@
+"""
+Test script for BCQ algorithms. Each test trains a variant of BCQ
+for a handful of gradient steps and tries one rollout with
+the model. Excludes stdout output by default (pass --verbose
+to see stdout output).
+"""
+import argparse
+from collections import OrderedDict
+
+import robomimic
+from robomimic.config import Config
+import robomimic.utils.test_utils as TestUtils
+from robomimic.utils.log_utils import silence_stdout
+from robomimic.utils.torch_utils import dummy_context_mgr
+
+
+def get_algo_base_config():
+ """
+ Base config for testing BCQ algorithms.
+ """
+
+ # config with basic settings for quick training run
+ config = TestUtils.get_base_config(algo_name="bcq")
+
+ # low-level obs (note that we define it here because @observation structure might vary per algorithm,
+ # for example HBC)
+ config.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.modalities.obs.rgb = []
+
+ # by default, vanilla BCQ
+ config.algo.actor.enabled = True # perturbation actor
+ config.algo.critic.distributional.enabled = False # vanilla critic training
+ config.algo.action_sampler.vae.enabled = True # action sampler is VAE
+ config.algo.action_sampler.gmm.enabled = False
+
+ return config
+
+
+def convert_config_for_images(config):
+ """
+ Modify config to use image observations.
+ """
+
+ # using high-dimensional images - don't load entire dataset into memory, and smaller batch size
+ config.train.hdf5_cache_mode = "low_dim"
+ config.train.num_data_workers = 0
+ config.train.batch_size = 16
+
+ # replace object with rgb modality
+ config.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos"]
+ config.observation.modalities.obs.rgb = ["agentview_image"]
+
+ # set up visual encoders
+ config.observation.encoder.rgb.core_class = "VisualCore"
+ config.observation.encoder.rgb.core_kwargs.feature_dimension = 64
+ config.observation.encoder.rgb.core_kwargs.backbone_class = 'ResNet18Conv' # ResNet backbone for image observations (unused if no image observations)
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.pretrained = False # kwargs for visual core
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.input_coord_conv = False
+ config.observation.encoder.rgb.core_kwargs.pool_class = "SpatialSoftmax" # Alternate options are "SpatialMeanPool" or None (no pooling)
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.num_kp = 32 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.learnable_temperature = False # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.temperature = 1.0 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.noise_std = 0.0
+
+ # observation randomizer class - set to None to use no randomization, or 'CropRandomizer' to use crop randomization
+ config.observation.encoder.rgb.obs_randomizer_class = None
+
+ return config
+
+
+def make_image_modifier(config_modifier):
+ """
+    Turn a config modifier into its image version. Note that
+    this explicit function definition is needed for proper
+    scoping of @config_modifier.
+ """
+ return lambda x: config_modifier(convert_config_for_images(x))
+
+
+# mapping from test name to config modifier functions
+MODIFIERS = OrderedDict()
+def register_mod(test_name):
+    def decorator(config_modifier):
+        MODIFIERS[test_name] = config_modifier
+        return config_modifier
+    return decorator
+
+
+@register_mod("bcq-no-actor")
+def bcq_no_actor_modifier(config):
+ config.algo.actor.enabled = False
+ return config
+
+
+@register_mod("bcq-distributional")
+def bcq_distributional_modifier(config):
+ config.algo.critic.distributional.enabled = True
+ config.algo.critic.value_bounds = [-100., 100.]
+ return config
+
+
+@register_mod("bcq-as-gmm")
+def bcq_gmm_modifier(config):
+ config.algo.action_sampler.gmm.enabled = True
+ config.algo.action_sampler.vae.enabled = False
+ return config
+
+
+@register_mod("bcq-as-vae, N(0, 1) prior")
+def bcq_vae_modifier_1(config):
+ # N(0, 1) prior
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = False
+ config.algo.action_sampler.vae.prior.is_conditioned = False
+ return config
+
+
+@register_mod("bcq-as-vae, Gaussian prior (obs-independent)")
+def bcq_vae_modifier_2(config):
+ # learn parameters of Gaussian prior (obs-independent)
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = False
+ config.algo.action_sampler.vae.prior.use_gmm = False
+ config.algo.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bcq-as-vae, Gaussian prior (obs-dependent)")
+def bcq_vae_modifier_3(config):
+ # learn parameters of Gaussian prior (obs-dependent)
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = True
+ config.algo.action_sampler.vae.prior.use_gmm = False
+ config.algo.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bcq-as-vae, GMM prior (obs-independent, weights-fixed)")
+def bcq_vae_modifier_4(config):
+ # learn parameters of GMM prior (obs-independent, weights-fixed)
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = False
+ config.algo.action_sampler.vae.prior.use_gmm = True
+ config.algo.action_sampler.vae.prior.gmm_learn_weights = False
+ config.algo.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bcq-as-vae, GMM prior (obs-independent, weights-learned)")
+def bcq_vae_modifier_5(config):
+ # learn parameters of GMM prior (obs-independent, weights-learned)
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = False
+ config.algo.action_sampler.vae.prior.use_gmm = True
+ config.algo.action_sampler.vae.prior.gmm_learn_weights = True
+ config.algo.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bcq-as-vae, GMM prior (obs-dependent, weights-fixed)")
+def bcq_vae_modifier_6(config):
+ # learn parameters of GMM prior (obs-dependent, weights-fixed)
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = True
+ config.algo.action_sampler.vae.prior.use_gmm = True
+ config.algo.action_sampler.vae.prior.gmm_learn_weights = False
+ config.algo.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bcq-as-vae, GMM prior (obs-dependent, weights-learned)")
+def bcq_vae_modifier_7(config):
+ # learn parameters of GMM prior (obs-dependent, weights-learned)
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = True
+ config.algo.action_sampler.vae.prior.use_gmm = True
+ config.algo.action_sampler.vae.prior.gmm_learn_weights = True
+ config.algo.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("bcq-as-vae, uniform categorical prior")
+def bcq_vae_modifier_8(config):
+ # uniform categorical prior
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = False
+ config.algo.action_sampler.vae.prior.is_conditioned = False
+ config.algo.action_sampler.vae.prior.use_gmm = False
+ config.algo.action_sampler.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("bcq-as-vae, categorical prior (obs-independent)")
+def bcq_vae_modifier_9(config):
+ # learn parameters of categorical prior (obs-independent)
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = False
+ config.algo.action_sampler.vae.prior.use_gmm = False
+ config.algo.action_sampler.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("bcq-as-vae, categorical prior (obs-dependent)")
+def bcq_vae_modifier_10(config):
+ # learn parameters of categorical prior (obs-dependent)
+ config.algo.action_sampler.vae.enabled = True
+ config.algo.action_sampler.vae.prior.learn = True
+ config.algo.action_sampler.vae.prior.is_conditioned = True
+ config.algo.action_sampler.vae.prior.use_gmm = False
+ config.algo.action_sampler.vae.prior.use_categorical = True
+ return config
+
+
+# add image version of all tests
+image_modifiers = OrderedDict()
+for test_name in MODIFIERS:
+ lst = test_name.split("-")
+ name = "-".join(lst[:1] + ["rgb"] + lst[1:])
+ image_modifiers[name] = make_image_modifier(MODIFIERS[test_name])
+MODIFIERS.update(image_modifiers)
+
+
+# test for image crop randomization
+@register_mod("bcq-image-crop")
+def bcq_image_crop_modifier(config):
+ config = convert_config_for_images(config)
+
+ # observation randomizer class - using Crop randomizer
+ config.observation.encoder.rgb.obs_randomizer_class = "CropRandomizer"
+
+ # kwargs for observation randomizers (for the CropRandomizer, this is size and number of crops)
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_height = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_width = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.num_crops = 1
+ config.observation.encoder.rgb.obs_randomizer_kwargs.pos_enc = False
+ return config
+
+
+def test_bcq(silence=True):
+ for test_name in MODIFIERS:
+ context = silence_stdout() if silence else dummy_context_mgr()
+ with context:
+ base_config = get_algo_base_config()
+ res_str = TestUtils.test_run(base_config=base_config, config_modifier=MODIFIERS[test_name])
+ print("{}: {}".format(test_name, res_str))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="don't suppress stdout during tests",
+ )
+ args = parser.parse_args()
+
+ test_bcq(silence=(not args.verbose))
diff --git a/phantom/submodules/phantom-robomimic/tests/test_cql.py b/phantom/submodules/phantom-robomimic/tests/test_cql.py
new file mode 100644
index 0000000000000000000000000000000000000000..a78c4bf222e4fb312c0b9395a65001ccbd474f83
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test_cql.py
@@ -0,0 +1,152 @@
+"""
+Test script for CQL algorithms. Each test trains a variant of CQL
+for a handful of gradient steps and tries one rollout with
+the model. Excludes stdout output by default (pass --verbose
+to see stdout output).
+"""
+import argparse
+from collections import OrderedDict
+
+import robomimic
+from robomimic.config import Config
+import robomimic.utils.test_utils as TestUtils
+from robomimic.utils.log_utils import silence_stdout
+from robomimic.utils.torch_utils import dummy_context_mgr
+
+
+def get_algo_base_config():
+ """
+ Base config for testing CQL algorithms.
+ """
+
+ # config with basic settings for quick training run
+ config = TestUtils.get_base_config(algo_name="cql")
+
+ # low-level obs (note that we define it here because @observation structure might vary per algorithm,
+ # for example HBC)
+ config.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.modalities.obs.rgb = []
+
+ # by default, vanilla CQL
+ config.algo.actor.bc_start_steps = 40 # BC training initially
+ config.algo.critic.target_q_gap = 5.0 # use automatic cql tuning
+ config.algo.actor.target_entropy = "default" # use automatic entropy tuning
+
+    # lower batch size to 100 to accommodate the small test dataset
+ config.train.batch_size = 100
+
+ return config
+
+
+def convert_config_for_images(config):
+ """
+ Modify config to use image observations.
+ """
+
+ # using high-dimensional images - don't load entire dataset into memory, and smaller batch size
+ config.train.hdf5_cache_mode = "low_dim"
+ config.train.num_data_workers = 0
+ config.train.batch_size = 16
+
+ # replace object with rgb modality
+ config.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos"]
+ config.observation.modalities.obs.rgb = ["agentview_image"]
+
+ # set up visual encoders
+ config.observation.encoder.rgb.core_class = "VisualCore"
+ config.observation.encoder.rgb.core_kwargs.feature_dimension = 64
+ config.observation.encoder.rgb.core_kwargs.backbone_class = 'ResNet18Conv' # ResNet backbone for image observations (unused if no image observations)
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.pretrained = False # kwargs for visual core
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.input_coord_conv = False
+ config.observation.encoder.rgb.core_kwargs.pool_class = "SpatialSoftmax" # Alternate options are "SpatialMeanPool" or None (no pooling)
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.num_kp = 32 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.learnable_temperature = False # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.temperature = 1.0 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.noise_std = 0.0
+
+ # observation randomizer class - set to None to use no randomization, or 'CropRandomizer' to use crop randomization
+ config.observation.encoder.rgb.obs_randomizer_class = None
+
+ return config
+
+
+def make_image_modifier(config_modifier):
+ """
+    Turn a config modifier into its image version. Note that
+    this explicit function definition is needed for proper
+    scoping of @config_modifier.
+ """
+ return lambda x: config_modifier(convert_config_for_images(x))
+
+
+# mapping from test name to config modifier functions
+MODIFIERS = OrderedDict()
+def register_mod(test_name):
+    def decorator(config_modifier):
+        MODIFIERS[test_name] = config_modifier
+        return config_modifier
+    return decorator
+
+
+@register_mod("cql-fixed-entropy")
+def cql_entropy_modifier(config):
+ config.algo.actor.target_entropy = None
+ return config
+
+
+@register_mod("cql-fixed-q-gap")
+def cql_q_gap_modifier(config):
+ config.algo.critic.target_q_gap = None
+ config.algo.critic.cql_weight = 1.0
+ return config
+
+
+@register_mod("cql-fixed-gaussian")
+def cql_gaussian_modifier(config):
+ config.algo.actor.net.gaussian.fixed_std = True
+ return config
+
+
+# add image version of all tests
+image_modifiers = OrderedDict()
+for test_name in MODIFIERS:
+ lst = test_name.split("-")
+ name = "-".join(lst[:1] + ["rgb"] + lst[1:])
+ image_modifiers[name] = make_image_modifier(MODIFIERS[test_name])
+MODIFIERS.update(image_modifiers)
+
+
+# test for image crop randomization
+@register_mod("cql-image-crop")
+def cql_image_crop_modifier(config):
+ config = convert_config_for_images(config)
+
+ # observation randomizer class - using Crop randomizer
+ config.observation.encoder.rgb.obs_randomizer_class = "CropRandomizer"
+
+ # kwargs for observation randomizers (for the CropRandomizer, this is size and number of crops)
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_height = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_width = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.num_crops = 1
+ config.observation.encoder.rgb.obs_randomizer_kwargs.pos_enc = False
+ return config
+
+
+def test_cql(silence=True):
+ for test_name in MODIFIERS:
+ context = silence_stdout() if silence else dummy_context_mgr()
+ with context:
+ base_config = get_algo_base_config()
+ res_str = TestUtils.test_run(base_config=base_config, config_modifier=MODIFIERS[test_name])
+ print("{}: {}".format(test_name, res_str))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="don't suppress stdout during tests",
+ )
+ args = parser.parse_args()
+
+ test_cql(silence=(not args.verbose))
diff --git a/phantom/submodules/phantom-robomimic/tests/test_examples.py b/phantom/submodules/phantom-robomimic/tests/test_examples.py
new file mode 100644
index 0000000000000000000000000000000000000000..6696015f18bd3ef59bf4e04fd19d41a1026dd0e3
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test_examples.py
@@ -0,0 +1,84 @@
+"""
+Tests for the provided examples in the repository. Excludes stdout output
+by default (pass --verbose to see stdout output).
+"""
+import argparse
+import traceback
+import os
+import subprocess
+import time
+import h5py
+import numpy as np
+import torch
+from collections import OrderedDict
+from termcolor import colored
+
+import robomimic
+import robomimic.utils.test_utils as TestUtils
+import robomimic.utils.torch_utils as TorchUtils
+from robomimic.utils.log_utils import silence_stdout
+from robomimic.utils.torch_utils import dummy_context_mgr
+
+
+def test_example_script(script_name, args_string, test_name, silence=True):
+ """
+ Helper function to run an example script with filename @script_name and
+ with test name @test_name (which will be printed to terminal with
+ the stderr output of the example script).
+ """
+
+ # run example script
+ stdout = subprocess.DEVNULL if silence else None
+ path_to_script = os.path.join(robomimic.__path__[0], "../examples/{}".format(script_name))
+ example_job = subprocess.Popen("python {} {}".format(path_to_script, args_string),
+ shell=True, stdout=stdout, stderr=subprocess.PIPE)
+ example_job.wait()
+
+ # get stderr output
+ out, err = example_job.communicate()
+ err = err.decode("utf-8")
+ if len(err) > 0:
+ ret = "maybe failed - stderr output below (if it's only from tqdm, the test passed)\n{}".format(err)
+ ret = colored(ret, "red")
+ else:
+ ret = colored("passed", "green")
+ print("{}: {}".format(test_name, ret))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="don't suppress stdout during tests",
+ )
+ args = parser.parse_args()
+
+ test_example_script(
+ script_name="simple_config.py",
+ args_string="",
+ test_name="simple-config-example",
+ silence=(not args.verbose),
+ )
+ test_example_script(
+ script_name="simple_obs_nets.py",
+ args_string="",
+ test_name="simple-obs-nets-example",
+ silence=(not args.verbose),
+ )
+ test_example_script(
+ script_name="simple_train_loop.py",
+ args_string="",
+ test_name="simple-train-loop-example",
+ silence=(not args.verbose),
+ )
+ # clear tmp model dir before running script
+ TestUtils.maybe_remove_dir(TestUtils.temp_model_dir_path())
+ test_example_script(
+ script_name="train_bc_rnn.py",
+ args_string="--debug",
+ test_name="train-bc-rnn-example",
+ silence=(not args.verbose),
+ )
+ # cleanup
+ TestUtils.maybe_remove_dir(TestUtils.temp_model_dir_path())
diff --git a/phantom/submodules/phantom-robomimic/tests/test_hbc.py b/phantom/submodules/phantom-robomimic/tests/test_hbc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e55606960b85164a36d3b6d9fc35bde1c6f0fa03
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test_hbc.py
@@ -0,0 +1,184 @@
+"""
+Test script for HBC algorithm. Each test trains a variant of HBC
+for a handful of gradient steps and tries one rollout with
+the model. Excludes stdout output by default (pass --verbose
+to see stdout output).
+"""
+import argparse
+from collections import OrderedDict
+
+import robomimic
+import robomimic.utils.test_utils as TestUtils
+from robomimic.utils.log_utils import silence_stdout
+from robomimic.utils.torch_utils import dummy_context_mgr
+
+
+def get_algo_base_config():
+ """
+    Base config for testing HBC algorithms.
+ """
+
+ # config with basic settings for quick training run
+ config = TestUtils.get_base_config(algo_name="hbc")
+
+ # low-level obs (note that we define it here because @observation structure might vary per algorithm,
+ # for example HBC)
+ config.observation.planner.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.planner.modalities.obs.rgb = []
+
+ config.observation.planner.modalities.subgoal.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.planner.modalities.subgoal.rgb = []
+
+ config.observation.actor.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.actor.modalities.obs.rgb = []
+
+ # by default, planner is deterministic prediction
+ config.algo.planner.vae.enabled = False
+
+ return config
+
+
+# mapping from test name to config modifier functions
+MODIFIERS = OrderedDict()
+def register_mod(test_name):
+    def decorator(config_modifier):
+        MODIFIERS[test_name] = config_modifier
+        return config_modifier
+    return decorator
+
+
+@register_mod("hbc")
+def hbc_modifier(config):
+ # no-op
+ return config
+
+
+@register_mod("hbc-vae, N(0, 1) prior")
+def hbc_vae_modifier_1(config):
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = False
+ config.algo.planner.vae.prior.is_conditioned = False
+ return config
+
+
+@register_mod("hbc-vae, Gaussian prior (obs-independent)")
+def hbc_vae_modifier_2(config):
+ # learn parameters of Gaussian prior (obs-independent)
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = True
+ config.algo.planner.vae.prior.is_conditioned = False
+ config.algo.planner.vae.prior.use_gmm = False
+ config.algo.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("hbc-vae, Gaussian prior (obs-dependent)")
+def hbc_vae_modifier_3(config):
+ # learn parameters of Gaussian prior (obs-dependent)
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = True
+ config.algo.planner.vae.prior.is_conditioned = True
+ config.algo.planner.vae.prior.use_gmm = False
+ config.algo.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("hbc-vae, GMM prior (obs-independent, weights-fixed)")
+def hbc_vae_modifier_4(config):
+ # learn parameters of GMM prior (obs-independent, weights-fixed)
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = True
+ config.algo.planner.vae.prior.is_conditioned = False
+ config.algo.planner.vae.prior.use_gmm = True
+ config.algo.planner.vae.prior.gmm_learn_weights = False
+ config.algo.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("hbc-vae, GMM prior (obs-independent, weights-learned)")
+def hbc_vae_modifier_5(config):
+ # learn parameters of GMM prior (obs-independent, weights-learned)
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = True
+ config.algo.planner.vae.prior.is_conditioned = False
+ config.algo.planner.vae.prior.use_gmm = True
+ config.algo.planner.vae.prior.gmm_learn_weights = True
+ config.algo.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("hbc-vae, GMM prior (obs-dependent, weights-fixed)")
+def hbc_vae_modifier_6(config):
+ # learn parameters of GMM prior (obs-dependent, weights-fixed)
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = True
+ config.algo.planner.vae.prior.is_conditioned = True
+ config.algo.planner.vae.prior.use_gmm = True
+ config.algo.planner.vae.prior.gmm_learn_weights = False
+ config.algo.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("hbc-vae, GMM prior (obs-dependent, weights-learned)")
+def hbc_vae_modifier_7(config):
+ # learn parameters of GMM prior (obs-dependent, weights-learned)
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = True
+ config.algo.planner.vae.prior.is_conditioned = True
+ config.algo.planner.vae.prior.use_gmm = True
+ config.algo.planner.vae.prior.gmm_learn_weights = True
+ config.algo.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("hbc-vae, uniform categorical prior")
+def hbc_vae_modifier_8(config):
+ # uniform categorical prior
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = False
+ config.algo.planner.vae.prior.is_conditioned = False
+ config.algo.planner.vae.prior.use_gmm = False
+ config.algo.planner.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("hbc-vae, categorical prior (obs-independent)")
+def hbc_vae_modifier_9(config):
+ # learn parameters of categorical prior (obs-independent)
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = True
+ config.algo.planner.vae.prior.is_conditioned = False
+ config.algo.planner.vae.prior.use_gmm = False
+ config.algo.planner.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("hbc-vae, categorical prior (obs-dependent)")
+def hbc_vae_modifier_10(config):
+ # learn parameters of categorical prior (obs-dependent)
+ config.algo.planner.vae.enabled = True
+ config.algo.planner.vae.prior.learn = True
+ config.algo.planner.vae.prior.is_conditioned = True
+ config.algo.planner.vae.prior.use_gmm = False
+ config.algo.planner.vae.prior.use_categorical = True
+ return config
+
+
+def test_hbc(silence=True):
+ for test_name in MODIFIERS:
+ context = silence_stdout() if silence else dummy_context_mgr()
+ with context:
+ base_config = get_algo_base_config()
+ res_str = TestUtils.test_run(base_config=base_config, config_modifier=MODIFIERS[test_name])
+ print("{}: {}".format(test_name, res_str))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="don't suppress stdout during tests",
+ )
+ args = parser.parse_args()
+
+ test_hbc(silence=(not args.verbose))
diff --git a/phantom/submodules/phantom-robomimic/tests/test_iql.py b/phantom/submodules/phantom-robomimic/tests/test_iql.py
new file mode 100644
index 0000000000000000000000000000000000000000..e80a8f3bdc2949166d65c32de1134c85fc6c7901
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test_iql.py
@@ -0,0 +1,143 @@
+"""
+Test script for IQL algorithms. Each test trains a variant of IQL
+for a handful of gradient steps and tries one rollout with
+the model. Excludes stdout output by default (pass --verbose
+to see stdout output).
+"""
+import argparse
+from collections import OrderedDict
+
+import robomimic
+from robomimic.config import Config
+import robomimic.utils.test_utils as TestUtils
+from robomimic.utils.log_utils import silence_stdout
+from robomimic.utils.torch_utils import dummy_context_mgr
+
+
+def get_algo_base_config():
+ """
+ Base config for testing IQL algorithms.
+ """
+
+ # config with basic settings for quick training run
+ config = TestUtils.get_base_config(algo_name="iql")
+
+ # low-level obs (note that we define it here because @observation structure might vary per algorithm,
+ # for example HBC)
+ config.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.modalities.obs.rgb = []
+
+ return config
+
+
+def convert_config_for_images(config):
+ """
+ Modify config to use image observations.
+ """
+
+ # using high-dimensional images - don't load entire dataset into memory, and smaller batch size
+ config.train.hdf5_cache_mode = "low_dim"
+ config.train.num_data_workers = 0
+ config.train.batch_size = 16
+
+ # replace object with rgb modality
+ config.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos"]
+ config.observation.modalities.obs.rgb = ["agentview_image"]
+
+ # set up visual encoders
+ config.observation.encoder.rgb.core_class = "VisualCore"
+ config.observation.encoder.rgb.core_kwargs.feature_dimension = 64
+ config.observation.encoder.rgb.core_kwargs.backbone_class = 'ResNet18Conv' # ResNet backbone for image observations (unused if no image observations)
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.pretrained = False # kwargs for visual core
+ config.observation.encoder.rgb.core_kwargs.backbone_kwargs.input_coord_conv = False
+ config.observation.encoder.rgb.core_kwargs.pool_class = "SpatialSoftmax" # Alternate options are "SpatialMeanPool" or None (no pooling)
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.num_kp = 32 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.learnable_temperature = False # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.temperature = 1.0 # Default arguments for "SpatialSoftmax"
+ config.observation.encoder.rgb.core_kwargs.pool_kwargs.noise_std = 0.0
+
+ # observation randomizer class - set to None to use no randomization, or 'CropRandomizer' to use crop randomization
+ config.observation.encoder.rgb.obs_randomizer_class = None
+
+ return config
+
+
+def make_image_modifier(config_modifier):
+ """
+    Turn a config modifier into its image version. Note that
+    this explicit function definition is needed for proper
+    scoping of @config_modifier.
+ """
+ return lambda x: config_modifier(convert_config_for_images(x))
+
+
+# mapping from test name to config modifier functions
+MODIFIERS = OrderedDict()
+def register_mod(test_name):
+    def decorator(config_modifier):
+        MODIFIERS[test_name] = config_modifier
+        return config_modifier
+    return decorator
+
+
+@register_mod("iql-gaussian")
+def iql_gaussian_modifier(config):
+ config.algo.actor.net.type = "gaussian"
+ return config
+
+
+@register_mod("iql-gmm")
+def iql_gmm_modifier(config):
+ config.algo.actor.net.type = "gmm"
+ return config
+
+
+@register_mod("iql-clip-adv")
+def iql_clip_adv_modifier(config):
+ config.algo.adv.clip_adv_value = 1.0
+ return config
+
+
+# add image version of all tests
+image_modifiers = OrderedDict()
+for test_name in MODIFIERS:
+ lst = test_name.split("-")
+ name = "-".join(lst[:1] + ["rgb"] + lst[1:])
+ image_modifiers[name] = make_image_modifier(MODIFIERS[test_name])
+MODIFIERS.update(image_modifiers)
+
+
+# test for image crop randomization
+@register_mod("iql-image-crop")
+def iql_image_crop_modifier(config):
+ config = convert_config_for_images(config)
+
+ # observation randomizer class - using Crop randomizer
+ config.observation.encoder.rgb.obs_randomizer_class = "CropRandomizer"
+
+ # kwargs for observation randomizers (for the CropRandomizer, this is size and number of crops)
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_height = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.crop_width = 76
+ config.observation.encoder.rgb.obs_randomizer_kwargs.num_crops = 1
+ config.observation.encoder.rgb.obs_randomizer_kwargs.pos_enc = False
+ return config
+
+
+def test_iql(silence=True):
+ for test_name in MODIFIERS:
+ context = silence_stdout() if silence else dummy_context_mgr()
+ with context:
+ base_config = get_algo_base_config()
+ res_str = TestUtils.test_run(base_config=base_config, config_modifier=MODIFIERS[test_name])
+ print("{}: {}".format(test_name, res_str))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="don't suppress stdout during tests",
+ )
+ args = parser.parse_args()
+
+ test_iql(silence=(not args.verbose))
diff --git a/phantom/submodules/phantom-robomimic/tests/test_iris.py b/phantom/submodules/phantom-robomimic/tests/test_iris.py
new file mode 100644
index 0000000000000000000000000000000000000000..126c5c288fff150bbde55aa8570e3e4e31df3943
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test_iris.py
@@ -0,0 +1,302 @@
+"""
+Test script for IRIS algorithms. Each test trains a variant of IRIS
+for a handful of gradient steps and tries one rollout with
+the model. Excludes stdout output by default (pass --verbose
+to see stdout output).
+"""
+import argparse
+from collections import OrderedDict
+
+import robomimic
+import robomimic.utils.test_utils as TestUtils
+from robomimic.utils.log_utils import silence_stdout
+from robomimic.utils.torch_utils import dummy_context_mgr
+
+
+def get_algo_base_config():
+ """
+    Base config for testing IRIS algorithms.
+ """
+
+ # config with basic settings for quick training run
+ config = TestUtils.get_base_config(algo_name="iris")
+
+ # low-level obs (note that we define it here because @observation structure might vary per algorithm,
+    # for example IRIS)
+ config.observation.value_planner.planner.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.value_planner.planner.modalities.obs.rgb = []
+
+ config.observation.value_planner.planner.modalities.subgoal.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.value_planner.planner.modalities.subgoal.rgb = []
+
+ config.observation.value_planner.value.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.value_planner.value.modalities.obs.rgb = []
+
+ config.observation.actor.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos", "object"]
+ config.observation.actor.modalities.obs.rgb = []
+
+ # by default, basic N(0, 1) prior for both planner VAE and BCQ cVAE
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = False
+ config.algo.value_planner.planner.vae.prior.is_conditioned = False
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = False
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = False
+
+ return config
+
+
+# mapping from test name to config modifier functions
+MODIFIERS = OrderedDict()
+def register_mod(test_name):
+    def decorator(config_modifier):
+        MODIFIERS[test_name] = config_modifier
+        return config_modifier
+    return decorator
+
+
+@register_mod("iris")
+def iris_modifier_1(config):
+ # no-op
+ return config
+
+
+@register_mod("iris, planner vae Gaussian prior (obs-independent)")
+def iris_modifier_2(config):
+ # learn parameters of Gaussian prior (obs-independent)
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = True
+ config.algo.value_planner.planner.vae.prior.is_conditioned = False
+ config.algo.value_planner.planner.vae.prior.use_gmm = False
+ config.algo.value_planner.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, planner vae Gaussian prior (obs-dependent)")
+def iris_modifier_3(config):
+ # learn parameters of Gaussian prior (obs-dependent)
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = True
+ config.algo.value_planner.planner.vae.prior.is_conditioned = True
+ config.algo.value_planner.planner.vae.prior.use_gmm = False
+ config.algo.value_planner.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, planner vae GMM prior (obs-independent, weights-fixed)")
+def iris_modifier_4(config):
+ # learn parameters of GMM prior (obs-independent, weights-fixed)
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = True
+ config.algo.value_planner.planner.vae.prior.is_conditioned = False
+ config.algo.value_planner.planner.vae.prior.use_gmm = True
+ config.algo.value_planner.planner.vae.prior.gmm_learn_weights = False
+ config.algo.value_planner.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, planner vae GMM prior (obs-independent, weights-learned)")
+def iris_modifier_5(config):
+ # learn parameters of GMM prior (obs-independent, weights-learned)
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = True
+ config.algo.value_planner.planner.vae.prior.is_conditioned = False
+ config.algo.value_planner.planner.vae.prior.use_gmm = True
+ config.algo.value_planner.planner.vae.prior.gmm_learn_weights = True
+ config.algo.value_planner.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, planner vae GMM prior (obs-dependent, weights-fixed)")
+def iris_modifier_6(config):
+ # learn parameters of GMM prior (obs-dependent, weights-fixed)
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = True
+ config.algo.value_planner.planner.vae.prior.is_conditioned = True
+ config.algo.value_planner.planner.vae.prior.use_gmm = True
+ config.algo.value_planner.planner.vae.prior.gmm_learn_weights = False
+ config.algo.value_planner.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, planner vae GMM prior (obs-dependent, weights-learned)")
+def iris_modifier_7(config):
+ # learn parameters of GMM prior (obs-dependent, weights-learned)
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = True
+ config.algo.value_planner.planner.vae.prior.is_conditioned = True
+ config.algo.value_planner.planner.vae.prior.use_gmm = True
+ config.algo.value_planner.planner.vae.prior.gmm_learn_weights = True
+ config.algo.value_planner.planner.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, planner vae uniform categorical prior")
+def iris_modifier_8(config):
+ # uniform categorical prior
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = False
+ config.algo.value_planner.planner.vae.prior.is_conditioned = False
+ config.algo.value_planner.planner.vae.prior.use_gmm = False
+ config.algo.value_planner.planner.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("iris, planner vae categorical prior (obs-independent)")
+def iris_modifier_9(config):
+ # learn parameters of categorical prior (obs-independent)
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = True
+ config.algo.value_planner.planner.vae.prior.is_conditioned = False
+ config.algo.value_planner.planner.vae.prior.use_gmm = False
+ config.algo.value_planner.planner.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("iris, planner vae categorical prior (obs-dependent)")
+def iris_modifier_10(config):
+ # learn parameters of categorical prior (obs-dependent)
+ config.algo.value_planner.planner.vae.enabled = True
+ config.algo.value_planner.planner.vae.prior.learn = True
+ config.algo.value_planner.planner.vae.prior.is_conditioned = True
+ config.algo.value_planner.planner.vae.prior.use_gmm = False
+ config.algo.value_planner.planner.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("iris, bcq gmm")
+def iris_modifier_11(config):
+ # bcq action sampler is GMM
+ config.algo.value_planner.value.action_sampler.gmm.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.enabled = False
+ return config
+
+
+@register_mod("iris, bcq distributional")
+def iris_modifier_12(config):
+ # bcq value function is distributional
+ config.algo.value_planner.value.critic.distributional.enabled = True
+ config.algo.value_planner.value.critic.value_bounds = [-100., 100.]
+ return config
+
+@register_mod("iris, bcq cVAE Gaussian prior (obs-independent)")
+def iris_modifier_13(config):
+ # learn parameters of Gaussian prior (obs-independent)
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = True
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, bcq cVAE Gaussian prior (obs-dependent)")
+def iris_modifier_14(config):
+ # learn parameters of Gaussian prior (obs-dependent)
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = True
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = True
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, bcq cVAE GMM prior (obs-independent, weights-fixed)")
+def iris_modifier_15(config):
+ # learn parameters of GMM prior (obs-independent, weights-fixed)
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = True
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = True
+ config.algo.value_planner.value.action_sampler.vae.prior.gmm_learn_weights = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, bcq cVAE GMM prior (obs-independent, weights-learned)")
+def iris_modifier_16(config):
+ # learn parameters of GMM prior (obs-independent, weights-learned)
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = True
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = True
+ config.algo.value_planner.value.action_sampler.vae.prior.gmm_learn_weights = True
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, bcq cVAE GMM prior (obs-dependent, weights-fixed)")
+def iris_modifier_17(config):
+ # learn parameters of GMM prior (obs-dependent, weights-fixed)
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = True
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = True
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = True
+ config.algo.value_planner.value.action_sampler.vae.prior.gmm_learn_weights = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, bcq cVAE GMM prior (obs-dependent, weights-learned)")
+def iris_modifier_18(config):
+ # learn parameters of GMM prior (obs-dependent, weights-learned)
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = True
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = True
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = True
+ config.algo.value_planner.value.action_sampler.vae.prior.gmm_learn_weights = True
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = False
+ return config
+
+
+@register_mod("iris, bcq cVAE uniform categorical prior")
+def iris_modifier_19(config):
+ # uniform categorical prior
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = False
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("iris, bcq cVAE categorical prior (obs-independent)")
+def iris_modifier_20(config):
+ # learn parameters of categorical prior (obs-independent)
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = True
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = True
+ return config
+
+
+@register_mod("iris, bcq cVAE categorical prior (obs-dependent)")
+def iris_modifier_21(config):
+ # learn parameters of categorical prior (obs-dependent)
+ config.algo.value_planner.value.action_sampler.vae.enabled = True
+ config.algo.value_planner.value.action_sampler.vae.prior.learn = True
+ config.algo.value_planner.value.action_sampler.vae.prior.is_conditioned = True
+ config.algo.value_planner.value.action_sampler.vae.prior.use_gmm = False
+ config.algo.value_planner.value.action_sampler.vae.prior.use_categorical = True
+ return config
+
+
+def test_iris(silence=True):
+ for test_name in MODIFIERS:
+ context = silence_stdout() if silence else dummy_context_mgr()
+ with context:
+ base_config = get_algo_base_config()
+ res_str = TestUtils.test_run(base_config=base_config, config_modifier=MODIFIERS[test_name])
+ print("{}: {}".format(test_name, res_str))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="don't suppress stdout during tests",
+ )
+ args = parser.parse_args()
+
+ test_iris(silence=(not args.verbose))
diff --git a/phantom/submodules/phantom-robomimic/tests/test_scripts.py b/phantom/submodules/phantom-robomimic/tests/test_scripts.py
new file mode 100644
index 0000000000000000000000000000000000000000..30ed7f6112028f1403eb64b87ff4448664358869
--- /dev/null
+++ b/phantom/submodules/phantom-robomimic/tests/test_scripts.py
@@ -0,0 +1,170 @@
+"""
+Tests for a handful of scripts. Excludes stdout output by
+default (pass --verbose to see stdout output).
+"""
+import argparse
+import traceback
+import h5py
+import numpy as np
+import torch
+from collections import OrderedDict
+from termcolor import colored
+
+import robomimic
+import robomimic.utils.test_utils as TestUtils
+import robomimic.utils.torch_utils as TorchUtils
+from robomimic.config import Config
+from robomimic.utils.log_utils import silence_stdout
+from robomimic.utils.torch_utils import dummy_context_mgr
+from robomimic.scripts.train import train
+from robomimic.scripts.playback_dataset import playback_dataset
+from robomimic.scripts.run_trained_agent import run_trained_agent
+
+
+def get_checkpoint_to_test():
+ """
+ Run a quick training run to get a checkpoint. This function runs a basic bc-image
+ training run. RGB modality is used for a harder test case for the run agent
+ script, which will need to also try writing image observations to the rollout
+ dataset.
+ """
+
+ # prepare image training run
+ config = TestUtils.get_base_config(algo_name="bc")
+
+ def image_modifier(conf):
+ # using high-dimensional images - don't load entire dataset into memory, and smaller batch size
+ conf.train.hdf5_cache_mode = "low_dim"
+ conf.train.num_data_workers = 0
+ conf.train.batch_size = 16
+
+ # replace object with rgb modality
+ conf.observation.modalities.obs.low_dim = ["robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos"]
+ conf.observation.modalities.obs.rgb = ["agentview_image"]
+
+ # set up visual encoders
+ conf.observation.encoder.rgb.core_class = "VisualCore"
+ conf.observation.encoder.rgb.core_kwargs.feature_dimension = 64
+ conf.observation.encoder.rgb.core_kwargs.backbone_class = 'ResNet18Conv' # ResNet backbone for image observations (unused if no image observations)
+ conf.observation.encoder.rgb.core_kwargs.backbone_kwargs.pretrained = False # kwargs for visual core
+ conf.observation.encoder.rgb.core_kwargs.backbone_kwargs.input_coord_conv = False
+ conf.observation.encoder.rgb.core_kwargs.pool_class = "SpatialSoftmax" # Alternate options are "SpatialMeanPool" or None (no pooling)
+ conf.observation.encoder.rgb.core_kwargs.pool_kwargs.num_kp = 32 # Default arguments for "SpatialSoftmax"
+ conf.observation.encoder.rgb.core_kwargs.pool_kwargs.learnable_temperature = False # Default arguments for "SpatialSoftmax"
+ conf.observation.encoder.rgb.core_kwargs.pool_kwargs.temperature = 1.0 # Default arguments for "SpatialSoftmax"
+ conf.observation.encoder.rgb.core_kwargs.pool_kwargs.noise_std = 0.0
+
+ # observation randomizer class - set to None to use no randomization, or 'CropRandomizer' to use crop randomization
+ conf.observation.encoder.rgb.obs_randomizer_class = None
+
+ return conf
+
+ config = TestUtils.config_from_modifier(base_config=config, config_modifier=image_modifier)
+
+ # run training
+ device = TorchUtils.get_torch_device(try_to_use_cuda=True)
+ train(config, device=device)
+
+ # return checkpoint
+ ckpt_path = TestUtils.checkpoint_path_from_test_run()
+ return ckpt_path
+
+
+def test_playback_script(silence=True, use_actions=False, use_obs=False):
+ context = silence_stdout() if silence else dummy_context_mgr()
+ with context:
+
+ try:
+ # setup args and run script
+ args = argparse.Namespace()
+ args.dataset = TestUtils.example_dataset_path()
+ args.filter_key = None
+ args.n = 3 # playback 3 demonstrations
+ args.use_actions = use_actions
+ args.use_obs = use_obs
+ args.render = False
+ args.video_path = TestUtils.temp_video_path() # dump video
+ args.video_skip = 5
+ if use_obs:
+ # camera observation names
+ args.render_image_names = ["agentview_image", "robot0_eye_in_hand_image"]
+ else:
+ # camera names
+ args.render_image_names = ["agentview", "robot0_eye_in_hand"]
+ args.first = False
+ playback_dataset(args)
+
+ # indicate success
+ ret = colored("passed!", "green")
+
+ except Exception as e:
+ # indicate failure by returning error string
+ ret = colored("failed with error:\n{}\n\n{}".format(e, traceback.format_exc()), "red")
+
+ # delete output video
+ TestUtils.maybe_remove_file(TestUtils.temp_video_path())
+
+ act_str = "-action_playback" if use_actions else ""
+ obs_str = "-obs" if use_obs else ""
+ test_name = "playback-script{}{}".format(act_str, obs_str)
+ print("{}: {}".format(test_name, ret))
+
+
+def test_run_agent_script(silence=True):
+ context = silence_stdout() if silence else dummy_context_mgr()
+ with context:
+
+ try:
+ # get a model checkpoint
+ ckpt_path = get_checkpoint_to_test()
+
+ # setup args and run script
+ args = argparse.Namespace()
+ args.agent = ckpt_path
+ args.n_rollouts = 3 # 3 rollouts
+ args.horizon = 10 # short rollouts - 10 steps
+ args.env = None
+ args.render = False
+ args.video_path = TestUtils.temp_video_path() # dump video
+ args.video_skip = 5
+ args.camera_names = ["agentview", "robot0_eye_in_hand"]
+ args.dataset_path = TestUtils.temp_dataset_path() # dump dataset
+ args.dataset_obs = True
+ args.seed = 0
+ run_trained_agent(args)
+
+ # simple sanity check for shape of image observations in rollout dataset
+ f = h5py.File(TestUtils.temp_dataset_path(), "r")
+ assert f["data/demo_1/obs/agentview_image"].shape == (10, 84, 84, 3)
+ assert f["data/demo_1/obs/agentview_image"].dtype == np.uint8
+ f.close()
+
+ # indicate success
+ ret = colored("passed!", "green")
+
+ except Exception as e:
+ # indicate failure by returning error string
+ ret = colored("failed with error:\n{}\n\n{}".format(e, traceback.format_exc()), "red")
+
+ # delete trained model directory, output video, and output dataset
+ TestUtils.maybe_remove_dir(TestUtils.temp_model_dir_path())
+ TestUtils.maybe_remove_file(TestUtils.temp_video_path())
+ TestUtils.maybe_remove_file(TestUtils.temp_dataset_path())
+
+ test_name = "run-agent-script"
+ print("{}: {}".format(test_name, ret))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--verbose",
+ action='store_true',
+ help="don't suppress stdout during tests",
+ )
+ args = parser.parse_args()
+
+ test_playback_script(silence=(not args.verbose), use_actions=False, use_obs=False)
+ test_playback_script(silence=(not args.verbose), use_actions=True, use_obs=False)
+ test_playback_script(silence=(not args.verbose), use_actions=False, use_obs=True)
+ test_run_agent_script(silence=(not args.verbose))
diff --git a/phantom/submodules/phantom-robosuite/.gitignore b/phantom/submodules/phantom-robosuite/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a4fac789a46293cb89d30431ce350f3b617c87d6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/.gitignore
@@ -0,0 +1,117 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# mac
+.DS_Store
+
+# mujoco-key
+mjkey.txt
+
+.mujocomanip_temp_model.xml
+
+*.jpg
+.idea
+
+.pytest_cache/
+
+# private macros
+macros_private.py
diff --git a/phantom/submodules/phantom-robosuite/.pre-commit-config.yaml b/phantom/submodules/phantom-robosuite/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..32ee1fc68dc7abfff165bfe215ef82d1e2a3ea6b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/.pre-commit-config.yaml
@@ -0,0 +1,11 @@
+repos:
+ - repo: https://github.com/psf/black
+ rev: 22.10.0 # Replace by any tag/version: https://github.com/psf/black/tags
+ hooks:
+ - id: black
+ language_version: python3 # Should be a command that runs python3.6+
+ - repo: https://github.com/pycqa/isort
+ rev: 5.10.1
+ hooks:
+ - id: isort
+ name: isort (python)
diff --git a/phantom/submodules/phantom-robosuite/AUTHORS b/phantom/submodules/phantom-robosuite/AUTHORS
new file mode 100644
index 0000000000000000000000000000000000000000..281ace4009f198d4e5c46f7dcbd98decc20fc943
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/AUTHORS
@@ -0,0 +1,31 @@
+# This file contains an official list of authors of this framework.
+
+# Names should be added to this file as:
+# Name or Organization <email address>
+# The email address is not required for organizations.
+
+Core Team
+Yuke Zhu
+Josiah Wong
+Ajay Mandlekar
+Roberto Martín-Martín
+Abhishek Joshi
+Soroush Nasiriany
+Yifeng Zhu
+
+Past Contributors
+Jiren Zhu
+Jim (Linxi) Fan
+Orien Zeng
+Anchit Gupta
+Zihua Liu
+Joan Creus-Costa
+Michelle Lee
+Andrew Kondrich
+Rachel Gardner
+Jonathan Booher
+Danfei Xu
+Albert Tung
+Divyansh Jha
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/CONTRIBUTING.md b/phantom/submodules/phantom-robosuite/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..6c85921f8091257d0a7c564aef74d0f01206831b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/CONTRIBUTING.md
@@ -0,0 +1,47 @@
+How to Contribute
+=================
+
+We are so happy to see you reading this page!
+
+Our team wholeheartedly welcomes the community to contribute to robosuite. Contributions from members of the community will help ensure the long-term success of this project. Before you plan to make contributions, here are important resources to get started with:
+
+- Read the robosuite [documentation](https://robosuite.ai/docs/overview.html) and [whitepaper](https://robosuite.ai/assets/whitepaper.pdf)
+- Check our latest status from existing [issues](https://github.com/ARISE-Initiative/robosuite/issues), [pull requests](https://github.com/ARISE-Initiative/robosuite/pulls), and [branches](https://github.com/ARISE-Initiative/robosuite/branches) and avoid duplicate efforts
+- Join our [ARISE Slack](https://ariseinitiative.slack.com) workspace for technical discussions. Please [email us](mailto:yukez@cs.utexas.edu) to be added to the workspace.
+
+We encourage the community to make four major types of contributions:
+
+- **Bug fixes**: Address open issues and fix bugs presented in the `master` branch
+- **Environment designs:** Design new environments and add them to our existing set of [environments](https://github.com/ARISE-Initiative/robosuite/tree/master/robosuite/environments)
+- **Additional assets:** Incorporate new [models](https://github.com/ARISE-Initiative/robosuite/tree/master/robosuite/models) and functionalities of robots, grippers, objects, and workspaces
+- **New functionalities:** Implement new features, such as dynamics randomization, rendering tools, new controllers, etc.
+
+Testing
+-------
+Before submitting your contributions, make sure that the changes do not break existing functionalities.
+We have a handful of [tests](https://github.com/ARISE-Initiative/robosuite/tree/master/tests) for verifying the correctness of the code.
+You can run all the tests with the following command in the root folder of robosuite. Make sure that it does not throw any error before you proceed to the next step.
+```sh
+$ python -m pytest
+```
+
+Submission
+----------
+Please read the coding conventions below and make sure that your code is consistent with ours. We use [black](https://github.com/psf/black) and [isort](https://github.com/pycqa/isort) as [pre-commit](https://pre-commit.com/) hooks to format the source code before code review. To install these hooks, run `pip install pre-commit; pre-commit install`. Once set up, the hooks are triggered automatically when you commit new changes. If you want to manually check the format of code that has already been committed, run `pre-commit run --all-files` in the project folder.
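+
+For reference, the hook setup described above boils down to a few shell commands (shown here for convenience; adapt them to your own Python environment):
+```sh
+# one-time setup: install pre-commit and register the hooks
+$ pip install pre-commit
+$ pre-commit install
+
+# optionally re-format files that were committed before the hooks were installed
+$ pre-commit run --all-files
+```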
+
+When making a contribution, make a [pull request](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/proposing-changes-to-your-work-with-pull-requests)
+to robosuite with an itemized list of what you have done. When you submit a pull request, it is immensely helpful to include example script(s) that showcase the proposed changes and highlight any new APIs.
+We always love to see more test coverage. When it is appropriate, add a new test to the [tests](https://github.com/ARISE-Initiative/robosuite/tree/master/tests) folder for checking the correctness of your code.
+
+Coding Conventions
+------------------
+In addition to the pre-commit hooks, we value readability and adhere to the following coding conventions:
+- Indent using four spaces (soft tabs)
+- Always put spaces after list items and method parameters (e.g., `[1, 2, 3]` rather than `[1,2,3]`), and around operators and hash arrows (e.g., `x += 1` rather than `x+=1`)
+- Use the [Google Python Style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) for the docstrings (a short example follows this list)
+- For scripts such as in [demos](https://github.com/ARISE-Initiative/robosuite/tree/master/robosuite/demos) and [tests](https://github.com/ARISE-Initiative/robosuite/tree/master/tests),
+ include a docstring at the top of the file that describes the high-level purpose of the script and/or instructions on how to use the scripts (if relevant).
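+
+As a purely illustrative sketch (the function below is hypothetical and not part of robosuite), a docstring following these conventions looks like:
+```python
+def clip_command(command, limit=1.0):
+    """Clip a scalar command to a symmetric range.
+
+    Args:
+        command (float): Raw command value.
+        limit (float): Symmetric bound on the command magnitude.
+
+    Returns:
+        float: The clipped command.
+    """
+    return max(-limit, min(limit, command))
+```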
+
+We look forward to your contributions. Thanks!
+
+The robosuite core team
diff --git a/phantom/submodules/phantom-robosuite/LICENSE b/phantom/submodules/phantom-robosuite/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..e20189628cc282626830a7855edf722de38dc700
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/LICENSE
@@ -0,0 +1,28 @@
+MIT License
+
+Copyright (c) 2022 Stanford Vision and Learning Lab and UT Robot Perception and Learning Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+This software includes the partial implementation of Deepmind Mujoco https://github.com/deepmind/mujoco.
+Deepmind Mujoco is licensed under the Apache License, Version 2.0 (the "License");
+you may not use the files except in compliance with the License.
+
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
diff --git a/phantom/submodules/phantom-robosuite/MANIFEST.in b/phantom/submodules/phantom-robosuite/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..fa3a4c99e41758a1e0515f0d281e07a36374add4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/MANIFEST.in
@@ -0,0 +1,4 @@
+recursive-include robosuite/controllers/config/ *
+recursive-include robosuite/demos *
+recursive-include robosuite/models/assets/ *
+recursive-include robosuite/scripts *
diff --git a/phantom/submodules/phantom-robosuite/README.md b/phantom/submodules/phantom-robosuite/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..38e02567f34ddfbb58e0bf25525cf8cde259b685
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/README.md
@@ -0,0 +1,47 @@
+# robosuite
+
+
+
+[**[Homepage]**](https://robosuite.ai/) [**[White Paper]**](https://arxiv.org/abs/2009.12293) [**[Documentations]**](https://robosuite.ai/docs/overview.html) [**[ARISE Initiative]**](https://github.com/ARISE-Initiative)
+
+-------
+## Latest Updates
+- [11/15/2022] **v1.4**: Backend migration to DeepMind's official [MuJoCo Python binding](https://github.com/deepmind/mujoco), robot textures, and bug fixes :robot: [[release notes]](https://github.com/ARISE-Initiative/robosuite/releases/tag/v1.4.0) [[documentation]](http://robosuite.ai/docs/v1.4/)
+
+- [10/19/2021] **v1.3**: Ray tracing and physically based rendering tools :sparkles: and access to additional vision modalities 🎥 [[video spotlight]](https://www.youtube.com/watch?v=2xesly6JrQ8) [[release notes]](https://github.com/ARISE-Initiative/robosuite/releases/tag/v1.3) [[documentation]](http://robosuite.ai/docs/v1.3/)
+
+- [02/17/2021] **v1.2**: Added observable sensor models :eyes: and dynamics randomization :game_die: [[release notes]](https://github.com/ARISE-Initiative/robosuite/releases/tag/v1.2)
+
+- [12/17/2020] **v1.1**: Refactored infrastructure and standardized model classes for much easier environment prototyping :wrench: [[release notes]](https://github.com/ARISE-Initiative/robosuite/releases/tag/v1.1)
+
+-------
+
+**robosuite** is a simulation framework powered by the [MuJoCo](http://mujoco.org/) physics engine for robot learning. It also offers a suite of benchmark environments for reproducible research. The current release (v1.4) features long-term support with the official MuJoCo binding from DeepMind. This project is part of the broader [Advancing Robot Intelligence through Simulated Environments (ARISE) Initiative](https://github.com/ARISE-Initiative), with the aim of lowering the barriers of entry for cutting-edge research at the intersection of AI and Robotics.
+
+Data-driven algorithms, such as reinforcement learning and imitation learning, provide a powerful and generic tool in robotics. These learning paradigms, fueled by new advances in deep learning, have achieved some exciting successes in a variety of robot control problems. However, the challenges of reproducibility and the limited accessibility of robot hardware (especially during a pandemic) have impaired research progress. The overarching goal of **robosuite** is to provide researchers with:
+
+* a standardized set of benchmarking tasks for rigorous evaluation and algorithm development;
+* a modular design that offers great flexibility to design new robot simulation environments;
+* a high-quality implementation of robot controllers and off-the-shelf learning algorithms to lower the barriers to entry.
+
+This framework has been developed since late 2017 by researchers in the [Stanford Vision and Learning Lab](http://svl.stanford.edu) (SVL), originally as an internal tool for robot learning research. It is now actively maintained and used for robotics research projects in SVL and the [UT Robot Perception and Learning Lab](http://rpl.cs.utexas.edu) (RPL). We welcome community contributions to this project. For details, please check out our [contributing guidelines](CONTRIBUTING.md).
+
+This release of **robosuite** contains seven robot models, eight gripper models, six controller modes, and nine standardized tasks. It also offers a modular design of APIs for building new environments with procedural generation. We highlight these primary features below:
+
+* **standardized tasks**: a set of standardized manipulation tasks of large diversity and varying complexity and RL benchmarking results for reproducible research;
+* **procedural generation**: modular APIs for programmatically creating new environments and new tasks as combinations of robot models, arenas, and parameterized 3D objects;
+* **robot controllers**: a selection of controller types to command the robots, such as joint-space velocity control, inverse kinematics control, operational space control, and 3D motion devices for teleoperation;
+* **multi-modal sensors**: heterogeneous types of sensory signals, including low-level physical states, RGB cameras, depth maps, and proprioception;
+* **human demonstrations**: utilities for collecting human demonstrations, replaying demonstration datasets, and leveraging demonstration data for learning. Check out our sister project [robomimic](https://arise-initiative.github.io/robomimic-web/);
+* **photorealistic rendering**: integration with advanced graphics tools that provide real-time photorealistic renderings of simulated scenes.
+
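+Putting these pieces together, a minimal usage sketch (using the bundled `Lift` task, the `Panda` robot, and the default `OSC_POSE` controller configuration; adjust to your setup) looks like:
+
+```python
+import numpy as np
+
+import robosuite as suite
+from robosuite import load_controller_config
+
+# load the default operational-space (OSC_POSE) controller configuration
+controller_config = load_controller_config(default_controller="OSC_POSE")
+
+# create one of the standardized manipulation tasks with a Panda robot
+env = suite.make(
+    env_name="Lift",
+    robots="Panda",
+    controller_configs=controller_config,
+    has_renderer=False,
+    use_camera_obs=False,
+)
+
+# roll out a short episode with random actions
+obs = env.reset()
+low, high = env.action_spec
+for _ in range(10):
+    obs, reward, done, info = env.step(np.random.uniform(low, high))
+env.close()
+```
+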
+## Citation
+Please cite [**robosuite**](https://robosuite.ai) if you use this framework in your publications:
+```bibtex
+@inproceedings{robosuite2020,
+ title={robosuite: A Modular Simulation Framework and Benchmark for Robot Learning},
+ author={Yuke Zhu and Josiah Wong and Ajay Mandlekar and Roberto Mart\'{i}n-Mart\'{i}n and Abhishek Joshi and Soroush Nasiriany and Yifeng Zhu},
+ booktitle={arXiv preprint arXiv:2009.12293},
+ year={2020}
+}
+```
diff --git a/phantom/submodules/phantom-robosuite/pyproject.toml b/phantom/submodules/phantom-robosuite/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..6871e11a49971eef9e89416f8b5398efc4a1b522
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/pyproject.toml
@@ -0,0 +1,15 @@
+[tool.black]
+line-length = 120
+target-version = ["py36", "py37", "py38"]
+extend-exclude = "robosuite/((models/assets)|(controllers/config))"
+
+[tool.isort]
+profile = "black"
+line_length = 120
+skip = ["__init__.py"]
+filter_files = true
+py_version = "all"
+extend_skip = [
+ "robosuite/models/assets",
+ "robosuite/controllers/config",
+]
diff --git a/phantom/submodules/phantom-robosuite/requirements-extra.txt b/phantom/submodules/phantom-robosuite/requirements-extra.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f607ba9fdb88f725ee888089104e001c3e61a4f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/requirements-extra.txt
@@ -0,0 +1,14 @@
+# required for IK controllers
+pybullet-svl>=3.1.6.4
+
+# required for GymWrapper
+gymnasium
+
+# macOS only
+hidapi
+
+# required for demonstration utils
+h5py
+
+# required for nvisii renderer
+open3d
diff --git a/phantom/submodules/phantom-robosuite/requirements.txt b/phantom/submodules/phantom-robosuite/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d6e1198b1ab1f5a7f19c6f1fc2ba7338438cf718
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/requirements.txt
@@ -0,0 +1 @@
+-e .
diff --git a/phantom/submodules/phantom-robosuite/robosuite/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b88a03d99d94563900bae6711aeeca06961df0a3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/__init__.py
@@ -0,0 +1,29 @@
+from robosuite.environments.base import make
+
+# Manipulation environments
+from robosuite.environments.manipulation.lift import Lift
+from robosuite.environments.manipulation.stack import Stack
+from robosuite.environments.manipulation.nut_assembly import NutAssembly
+from robosuite.environments.manipulation.pick_place import PickPlace
+from robosuite.environments.manipulation.door import Door
+from robosuite.environments.manipulation.wipe import Wipe
+from robosuite.environments.manipulation.tool_hang import ToolHang
+from robosuite.environments.manipulation.two_arm_lift import TwoArmLift
+from robosuite.environments.manipulation.two_arm_peg_in_hole import TwoArmPegInHole
+from robosuite.environments.manipulation.two_arm_handover import TwoArmHandover
+from robosuite.environments.manipulation.two_arm_transport import TwoArmTransport
+from robosuite.environments.manipulation.phantom import Phantom
+from robosuite.environments.manipulation.phantom_bimanual import PhantomBimanual
+
+from robosuite.environments import ALL_ENVIRONMENTS
+from robosuite.controllers import ALL_CONTROLLERS, load_controller_config
+from robosuite.robots import ALL_ROBOTS
+from robosuite.models.grippers import ALL_GRIPPERS
+
+__version__ = "1.4.1"
+__logo__ = """
+ ; / ,--.
+ ["] ["] ,< |__**|
+ /[_]\ [~]\/ |// |
+ ] [ OOO /o|__|
+"""
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..55f2616277d88f49dccb1493e301b4a8d523eca8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/__init__.py
@@ -0,0 +1,17 @@
+from .controller_factory import controller_factory, load_controller_config, reset_controllers, get_pybullet_server
+from .osc import OperationalSpaceController
+from .joint_pos import JointPositionController
+from .joint_vel import JointVelocityController
+from .joint_tor import JointTorqueController
+
+
+CONTROLLER_INFO = {
+ "JOINT_VELOCITY": "Joint Velocity",
+ "JOINT_TORQUE": "Joint Torque",
+ "JOINT_POSITION": "Joint Position",
+ "OSC_POSITION": "Operational Space Control (Position Only)",
+ "OSC_POSE": "Operational Space Control (Position + Orientation)",
+ "IK_POSE": "Inverse Kinematics Control (Position + Orientation) (Note: must have PyBullet installed)",
+}
+
+ALL_CONTROLLERS = CONTROLLER_INFO.keys()
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/base_controller.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/base_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..895952a5a4e300ae58ec998cc379853d06fff689
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/base_controller.py
@@ -0,0 +1,273 @@
+import abc
+from collections.abc import Iterable
+
+import mujoco
+import numpy as np
+
+import robosuite.macros as macros
+
+
+class Controller(object, metaclass=abc.ABCMeta):
+ """
+ General controller interface.
+
+ Requires reference to mujoco sim object, eef_name of specific robot, relevant joint_indexes to that robot, and
+ whether an initial_joint is used for nullspace torques or not
+
+ Args:
+ sim (MjSim): Simulator instance this controller will pull robot state updates from
+
+ eef_name (str): Name of controlled robot arm's end effector (from robot XML)
+
+ joint_indexes (dict): Each key contains sim reference indexes to relevant robot joint information, namely:
+
+ :`'joints'`: list of indexes to relevant robot joints
+ :`'qpos'`: list of indexes to relevant robot joint positions
+ :`'qvel'`: list of indexes to relevant robot joint velocities
+
+ actuator_range (2-tuple of array of float): 2-Tuple (low, high) representing the robot joint actuator range
+ """
+
+ def __init__(
+ self,
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ ):
+
+ # Actuator range
+ self.actuator_min = actuator_range[0]
+ self.actuator_max = actuator_range[1]
+
+ # Attributes for scaling / clipping inputs to outputs
+ self.action_scale = None
+ self.action_input_transform = None
+ self.action_output_transform = None
+
+ # Private property attributes
+ self.control_dim = None
+ self.output_min = None
+ self.output_max = None
+ self.input_min = None
+ self.input_max = None
+
+ # mujoco simulator state
+ self.sim = sim
+ self.model_timestep = macros.SIMULATION_TIMESTEP
+ self.eef_name = eef_name
+ self.joint_index = joint_indexes["joints"]
+ self.qpos_index = joint_indexes["qpos"]
+ self.qvel_index = joint_indexes["qvel"]
+
+ # robot states
+ self.ee_pos = None
+ self.ee_ori_mat = None
+ self.ee_pos_vel = None
+ self.ee_ori_vel = None
+ self.joint_pos = None
+ self.joint_vel = None
+
+ # dynamics and kinematics
+ self.J_pos = None
+ self.J_ori = None
+ self.J_full = None
+ self.mass_matrix = None
+
+ # Joint dimension
+ self.joint_dim = len(joint_indexes["joints"])
+
+ # Torques being outputted by the controller
+ self.torques = None
+
+ # Update flag to prevent redundant update calls
+ self.new_update = True
+
+ # Move forward one timestep to propagate updates before taking first update
+ self.sim.forward()
+
+ # Initialize controller by updating internal state and setting the initial joint, pos, and ori
+ self.update()
+ self.initial_joint = self.joint_pos
+ self.initial_ee_pos = self.ee_pos
+ self.initial_ee_ori_mat = self.ee_ori_mat
+
+ @abc.abstractmethod
+ def run_controller(self):
+ """
+ Abstract method that should be implemented in all subclass controllers, and should convert a given action
+ into torques (pre gravity compensation) to be executed on the robot.
+ Additionally, resets the self.new_update flag so that the next self.update call will occur
+ """
+ self.new_update = True
+
+ def scale_action(self, action):
+ """
+ Clips @action to be within self.input_min and self.input_max, and then re-scale the values to be within
+ the range self.output_min and self.output_max
+
+ Args:
+ action (Iterable): Actions to scale
+
+ Returns:
+ np.array: Re-scaled action
+ """
+
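+        # Lazily compute the affine map (scale and range midpoints) from the input range to the output range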
+ if self.action_scale is None:
+ self.action_scale = abs(self.output_max - self.output_min) / abs(self.input_max - self.input_min)
+ self.action_output_transform = (self.output_max + self.output_min) / 2.0
+ self.action_input_transform = (self.input_max + self.input_min) / 2.0
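+        # Clip to the input range, then shift and scale into the output range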
+ action = np.clip(action, self.input_min, self.input_max)
+ transformed_action = (action - self.action_input_transform) * self.action_scale + self.action_output_transform
+
+ return transformed_action
+
+ def update(self, force=False):
+ """
+ Updates the state of the robot arm, including end effector pose / orientation / velocity, joint pos/vel,
+ jacobian, and mass matrix. By default, since this is a non-negligible computation, multiple redundant calls
+ will be ignored via the self.new_update attribute flag. However, if the @force flag is set, the update will
+        occur regardless of the state of self.new_update. The base class implementation of @run_controller resets the
+        self.new_update flag.
+
+ Args:
+ force (bool): Whether to force an update to occur or not
+ """
+
+ # Only run update if self.new_update or force flag is set
+ if self.new_update or force:
+ self.sim.forward()
+
+ self.ee_pos = np.array(self.sim.data.site_xpos[self.sim.model.site_name2id(self.eef_name)])
+ self.ee_ori_mat = np.array(
+ self.sim.data.site_xmat[self.sim.model.site_name2id(self.eef_name)].reshape([3, 3])
+ )
+ self.ee_pos_vel = np.array(self.sim.data.get_site_xvelp(self.eef_name))
+ self.ee_ori_vel = np.array(self.sim.data.get_site_xvelr(self.eef_name))
+
+ self.joint_pos = np.array(self.sim.data.qpos[self.qpos_index])
+ self.joint_vel = np.array(self.sim.data.qvel[self.qvel_index])
+
+ self.J_pos = np.array(self.sim.data.get_site_jacp(self.eef_name).reshape((3, -1))[:, self.qvel_index])
+ self.J_ori = np.array(self.sim.data.get_site_jacr(self.eef_name).reshape((3, -1))[:, self.qvel_index])
+ self.J_full = np.array(np.vstack([self.J_pos, self.J_ori]))
+
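+            # Densify MuJoCo's sparse joint-space inertia matrix (qM), then keep only this arm's DoF rows/columns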
+ mass_matrix = np.ndarray(shape=(self.sim.model.nv, self.sim.model.nv), dtype=np.float64, order="C")
+ mujoco.mj_fullM(self.sim.model._model, mass_matrix, self.sim.data.qM)
+ mass_matrix = np.reshape(mass_matrix, (len(self.sim.data.qvel), len(self.sim.data.qvel)))
+ self.mass_matrix = mass_matrix[self.qvel_index, :][:, self.qvel_index]
+
+ # Clear self.new_update
+ self.new_update = False
+
+ def update_base_pose(self, base_pos, base_ori):
+ """
+ Optional function to implement in subclass controllers that will take in @base_pos and @base_ori and update
+        internal configuration to account for changes in the respective states. Useful for controllers such as IK, which
+        is based on pybullet and requires knowledge of simulator state deviations between pybullet and mujoco
+
+ Args:
+ base_pos (3-tuple): x,y,z position of robot base in mujoco world coordinates
+            base_ori (4-tuple): x,y,z,w orientation of robot base in mujoco world coordinates
+ """
+ pass
+
+ def update_initial_joints(self, initial_joints):
+ """
+        Updates the internal attribute self.initial_joint. This is useful for updating controller-specific
+        behavior, such as with OSC, where self.initial_joint is used to determine nullspace actions
+
+ This function can also be extended by subclassed controllers for additional controller-specific updates
+
+ Args:
+ initial_joints (Iterable): Array of joint position values to update the initial joints
+ """
+ self.initial_joint = np.array(initial_joints)
+ self.update(force=True)
+ self.initial_ee_pos = self.ee_pos
+ self.initial_ee_ori_mat = self.ee_ori_mat
+
+ def clip_torques(self, torques):
+ """
+ Clips the torques to be within the actuator limits
+
+ Args:
+ torques (Iterable): Torques to clip
+
+ Returns:
+ np.array: Clipped torques
+ """
+ return np.clip(torques, self.actuator_min, self.actuator_max)
+
+ def reset_goal(self):
+ """
+        Resets the goal -- usually by setting the goal to all zeros, but in some cases may be different (e.g.: OSC)
+ """
+ raise NotImplementedError
+
+ @staticmethod
+ def nums2array(nums, dim):
+ """
+ Convert input @nums into numpy array of length @dim. If @nums is a single number, broadcasts it to the
+ corresponding dimension size @dim before converting into a numpy array
+
+ Args:
+ nums (numeric or Iterable): Either single value or array of numbers
+ dim (int): Size of array to broadcast input to env.sim.data.actuator_force
+
+ Returns:
+ np.array: Array filled with values specified in @nums
+ """
+ # First run sanity check to make sure no strings are being inputted
+ if isinstance(nums, str):
+ raise TypeError("Error: Only numeric inputs are supported for this function, nums2array!")
+
+ # Check if input is an Iterable, if so, we simply convert the input to np.array and return
+ # Else, input is a single value, so we map to a numpy array of correct size and return
+ return np.array(nums) if isinstance(nums, Iterable) else np.ones(dim) * nums
+
+ @property
+ def torque_compensation(self):
+ """
+ Gravity compensation for this robot arm
+
+ Returns:
+ np.array: torques
+ """
+ return self.sim.data.qfrc_bias[self.qvel_index]
+
+ @property
+ def actuator_limits(self):
+ """
+ Torque limits for this controller
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum actuator torques
+ - (np.array) maximum actuator torques
+ """
+ return self.actuator_min, self.actuator_max
+
+ @property
+ def control_limits(self):
+ """
+ Limits over this controller's action space, which defaults to input min/max
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum action values
+ - (np.array) maximum action values
+ """
+ return self.input_min, self.input_max
+
+ @property
+ def name(self):
+ """
+ Name of this controller
+
+ Returns:
+ str: controller name
+ """
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_baxter.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_baxter.json
new file mode 100644
index 0000000000000000000000000000000000000000..960d52fcd3389ee6cbfff72d6cb99be38bd02533
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_baxter.json
@@ -0,0 +1,11 @@
+{
+ "type": "JOINT_VELOCITY",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.5,
+ "output_min": -0.5,
+ "kp": 0.03,
+ "velocity_limits": [-1, 1],
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_iiwa.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_iiwa.json
new file mode 100644
index 0000000000000000000000000000000000000000..73d9018da0c99eb3edeaded02ee54723697b53da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_iiwa.json
@@ -0,0 +1,11 @@
+{
+ "type" : "JOINT_VELOCITY",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.5,
+ "output_min": -0.5,
+ "kp": 0.03,
+ "velocity_limits": [-1,1],
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_jaco.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_jaco.json
new file mode 100644
index 0000000000000000000000000000000000000000..73d9018da0c99eb3edeaded02ee54723697b53da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_jaco.json
@@ -0,0 +1,11 @@
+{
+ "type" : "JOINT_VELOCITY",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.5,
+ "output_min": -0.5,
+ "kp": 0.03,
+ "velocity_limits": [-1,1],
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_kinova3.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_kinova3.json
new file mode 100644
index 0000000000000000000000000000000000000000..73d9018da0c99eb3edeaded02ee54723697b53da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_kinova3.json
@@ -0,0 +1,11 @@
+{
+ "type" : "JOINT_VELOCITY",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.5,
+ "output_min": -0.5,
+ "kp": 0.03,
+ "velocity_limits": [-1,1],
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_panda.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_panda.json
new file mode 100644
index 0000000000000000000000000000000000000000..73d9018da0c99eb3edeaded02ee54723697b53da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_panda.json
@@ -0,0 +1,11 @@
+{
+ "type" : "JOINT_VELOCITY",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.5,
+ "output_min": -0.5,
+ "kp": 0.03,
+ "velocity_limits": [-1,1],
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_sawyer.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_sawyer.json
new file mode 100644
index 0000000000000000000000000000000000000000..73d9018da0c99eb3edeaded02ee54723697b53da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_sawyer.json
@@ -0,0 +1,11 @@
+{
+ "type" : "JOINT_VELOCITY",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.5,
+ "output_min": -0.5,
+ "kp": 0.03,
+ "velocity_limits": [-1,1],
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_ur5e.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_ur5e.json
new file mode 100644
index 0000000000000000000000000000000000000000..73d9018da0c99eb3edeaded02ee54723697b53da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/default_ur5e.json
@@ -0,0 +1,11 @@
+{
+ "type" : "JOINT_VELOCITY",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.5,
+ "output_min": -0.5,
+ "kp": 0.03,
+ "velocity_limits": [-1,1],
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/ik_pose.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/ik_pose.json
new file mode 100644
index 0000000000000000000000000000000000000000..45a0223f202b95020f5f94ecc78118b0c286e3c1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/ik_pose.json
@@ -0,0 +1,7 @@
+{
+ "type" : "IK_POSE",
+ "ik_pos_limit": 0.02,
+ "ik_ori_limit": 0.05,
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_position.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_position.json
new file mode 100644
index 0000000000000000000000000000000000000000..86cb4f576fc13a738c83fe93938482362a9f4284
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_position.json
@@ -0,0 +1,15 @@
+{
+ "type": "JOINT_POSITION",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.05,
+ "output_min": -0.05,
+ "kp": 50,
+ "damping_ratio": 1,
+ "impedance_mode": "fixed",
+ "kp_limits": [0, 300],
+ "damping_ratio_limits": [0, 10],
+ "qpos_limits": null,
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_torque.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_torque.json
new file mode 100644
index 0000000000000000000000000000000000000000..eab76b8b3832530ec3c522b192260e77547f5f7e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_torque.json
@@ -0,0 +1,10 @@
+{
+ "type": "JOINT_TORQUE",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.1,
+ "output_min": -0.1,
+ "torque_limits": null,
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_velocity.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_velocity.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d8752a3a26117234f7185487134a923a62e5846
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/joint_velocity.json
@@ -0,0 +1,11 @@
+{
+ "type" : "JOINT_VELOCITY",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": 0.5,
+ "output_min": -0.5,
+ "kp": 3.0,
+ "velocity_limits": [-1,1],
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/osc_pose.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/osc_pose.json
new file mode 100644
index 0000000000000000000000000000000000000000..8dc645e44bb13ba6806e7a74af52d1efaefc79e8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/osc_pose.json
@@ -0,0 +1,18 @@
+{
+ "type": "OSC_POSE",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5],
+ "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5],
+ "kp": 150,
+ "damping_ratio": 1,
+ "impedance_mode": "fixed",
+ "kp_limits": [0, 300],
+ "damping_ratio_limits": [0, 10],
+ "position_limits": null,
+ "orientation_limits": null,
+ "uncouple_pos_ori": true,
+ "control_delta": true,
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/config/osc_position.json b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/osc_position.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e1fd3b164f78ca75b25fb90ccb0bb9fc8b22d8e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/config/osc_position.json
@@ -0,0 +1,16 @@
+{
+ "type": "OSC_POSITION",
+ "input_max": 1,
+ "input_min": -1,
+ "output_max": [0.05, 0.05, 0.05],
+ "output_min": [-0.05, -0.05, -0.05],
+ "kp": 150,
+ "damping_ratio": 1,
+ "impedance_mode": "fixed",
+ "kp_limits": [0, 300],
+ "damping_ratio_limits": [0, 10],
+ "position_limits": null,
+ "control_delta": true,
+ "interpolation": null,
+ "ramp_ratio": 0.2
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/controller_factory.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/controller_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..12eac96736b0dd4d181bf690b460cbf302b52162
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/controller_factory.py
@@ -0,0 +1,168 @@
+"""
+Set of functions that streamline controller initialization process
+"""
+import json
+import os
+from copy import deepcopy
+
+import numpy as np
+
+from .interpolators.linear_interpolator import LinearInterpolator
+from .joint_pos import JointPositionController
+from .joint_tor import JointTorqueController
+from .joint_vel import JointVelocityController
+from .osc import OperationalSpaceController
+
+# Global var for linking pybullet server to multiple ik controller instances if necessary
+pybullet_server = None
+
+
+def reset_controllers():
+ """
+    Global function that performs any one-time clears and restarts of global controller-related
+    state before each individual controller is re-initialized
+ """
+ global pybullet_server
+ # Disconnect and reconnect to pybullet server if it exists
+ if pybullet_server is not None:
+ pybullet_server.disconnect()
+ pybullet_server.connect()
+
+
+def get_pybullet_server():
+ """
+ Getter to return reference to pybullet server module variable
+
+ Returns:
+ PyBulletServer: Server instance running PyBullet
+ """
+ global pybullet_server
+ return pybullet_server
+
+
+def load_controller_config(custom_fpath=None, default_controller=None):
+ """
+ Utility function that loads the desired controller and returns the loaded configuration as a dict
+
+ If @default_controller is specified, any value inputted to @custom_fpath is overridden and the default controller
+ configuration is automatically loaded. See specific arg description below for available default controllers.
+
+ Args:
+ custom_fpath (str): Absolute filepath to the custom controller configuration .json file to be loaded
+ default_controller (str): If specified, overrides @custom_fpath and loads a default configuration file for the
+ specified controller.
+ Choices are: {"JOINT_POSITION", "JOINT_TORQUE", "JOINT_VELOCITY", "OSC_POSITION", "OSC_POSE", "IK_POSE"}
+
+ Returns:
+ dict: Controller configuration
+
+ Raises:
+ AssertionError: [Unknown default controller name]
+ AssertionError: [No controller specified]
+ """
+ # First check if default controller is not None; if it is not, load the appropriate controller
+ if default_controller is not None:
+
+ # Assert that requested default controller is in the available default controllers
+ from robosuite.controllers import ALL_CONTROLLERS
+
+ assert (
+ default_controller in ALL_CONTROLLERS
+ ), "Error: Unknown default controller specified. Requested {}, " "available controllers: {}".format(
+ default_controller, list(ALL_CONTROLLERS)
+ )
+
+ # Store the default controller config fpath associated with the requested controller
+ custom_fpath = os.path.join(
+ os.path.dirname(__file__), "..", "controllers/config/{}.json".format(default_controller.lower())
+ )
+
+ # Assert that the fpath to load the controller is not empty
+ assert custom_fpath is not None, "Error: Either custom_fpath or default_controller must be specified!"
+
+ # Attempt to load the controller
+ try:
+ with open(custom_fpath) as f:
+ controller_config = json.load(f)
+ except FileNotFoundError:
+ print("Error opening controller filepath at: {}. " "Please check filepath and try again.".format(custom_fpath))
+
+ # Return the loaded controller
+ return controller_config
+
+
+def controller_factory(name, params):
+ """
+ Generator for controllers
+
+ Creates a Controller instance with the provided @name and relevant @params.
+
+ Args:
+ name (str): the name of the controller. Must be one of: {JOINT_POSITION, JOINT_TORQUE, JOINT_VELOCITY,
+ OSC_POSITION, OSC_POSE, IK_POSE}
+ params (dict): dict containing the relevant params to pass to the controller
+ sim (MjSim): Mujoco sim reference to pass to the controller
+
+ Returns:
+ Controller: Controller instance
+
+ Raises:
+ ValueError: [unknown controller]
+ """
+
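+    # Optionally build a linear interpolator between consecutive policy actions; params must provide ndim, sim, policy_freq, and ramp_ratio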
+ interpolator = None
+ if params["interpolation"] == "linear":
+ interpolator = LinearInterpolator(
+ ndim=params["ndim"],
+ controller_freq=(1 / params["sim"].model.opt.timestep),
+ policy_freq=params["policy_freq"],
+ ramp_ratio=params["ramp_ratio"],
+ )
+
+ if name == "OSC_POSE":
+ ori_interpolator = None
+ if interpolator is not None:
+ interpolator.set_states(dim=3) # EE control uses dim 3 for pos and ori each
+ ori_interpolator = deepcopy(interpolator)
+ ori_interpolator.set_states(ori="euler")
+ params["control_ori"] = True
+ return OperationalSpaceController(interpolator_pos=interpolator, interpolator_ori=ori_interpolator, **params)
+
+ if name == "OSC_POSITION":
+ if interpolator is not None:
+ interpolator.set_states(dim=3) # EE control uses dim 3 for pos
+ params["control_ori"] = False
+ return OperationalSpaceController(interpolator_pos=interpolator, **params)
+
+ if name == "IK_POSE":
+ ori_interpolator = None
+ if interpolator is not None:
+ interpolator.set_states(dim=3) # EE IK control uses dim 3 for pos and dim 4 for ori
+ ori_interpolator = deepcopy(interpolator)
+ ori_interpolator.set_states(dim=4, ori="quat")
+
+ # Import pybullet server if necessary
+ global pybullet_server
+ from .ik import InverseKinematicsController
+
+ if pybullet_server is None:
+ from robosuite.controllers.ik import PyBulletServer
+
+ pybullet_server = PyBulletServer()
+ return InverseKinematicsController(
+ interpolator_pos=interpolator,
+ interpolator_ori=ori_interpolator,
+ bullet_server_id=pybullet_server.server_id,
+ **params,
+ )
+
+ if name == "JOINT_VELOCITY":
+ return JointVelocityController(interpolator=interpolator, **params)
+
+ if name == "JOINT_POSITION":
+ return JointPositionController(interpolator=interpolator, **params)
+
+ if name == "JOINT_TORQUE":
+ return JointTorqueController(interpolator=interpolator, **params)
+
+ raise ValueError("Unknown controller name: {}".format(name))
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/ik.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/ik.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d8a676525984d6f474f2ef0109df11f3fb12f0f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/ik.py
@@ -0,0 +1,726 @@
+"""
+***********************************************************************************
+
+NOTE: requires pybullet module.
+
+Run `pip install "pybullet-svl>=3.1.6.4"`.
+
+
+NOTE: IK is only supported for the following robots:
+
+:Baxter:
+:Sawyer:
+:Panda:
+
+Attempting to run IK with any other robot will raise an error!
+
+***********************************************************************************
+"""
+try:
+ import pybullet as p
+except ImportError:
+ raise Exception("""Please make sure pybullet is installed. Run `pip install "pybullet-svl>=3.1.6.4"`""")
+import os
+from os.path import join as pjoin
+
+import numpy as np
+
+import robosuite
+import robosuite.utils.transform_utils as T
+from robosuite.controllers.joint_vel import JointVelocityController
+from robosuite.utils.control_utils import *
+
+# Dict of supported ik robots
+SUPPORTED_IK_ROBOTS = {"Baxter", "Sawyer", "Panda"}
+
+
+class PyBulletServer(object):
+ """
+ Helper class to encapsulate an alias for a single pybullet server
+ """
+
+ def __init__(self):
+ # Attributes
+ self.server_id = None
+ self.is_active = False
+
+ # Bodies: Dict of active in pybullet simulation
+ self.bodies = {}
+
+ # Automatically setup this pybullet server
+ self.connect()
+
+ def connect(self):
+ """
+        Connects (or re-connects) to the pybullet server instance if it's not currently active
+ """
+ if not self.is_active:
+ self.server_id = p.connect(p.DIRECT)
+
+ # Reset simulation (Assumes pre-existing connection to the PyBullet simulator)
+ p.resetSimulation(physicsClientId=self.server_id)
+ self.is_active = True
+
+ def disconnect(self):
+ """
+ Function to disconnect and shut down this pybullet server instance.
+
+ Should be called externally before resetting / instantiating a new controller
+ """
+ if self.is_active:
+ p.disconnect(physicsClientId=self.server_id)
+ self.bodies = {}
+ self.is_active = False
+
+
+class InverseKinematicsController(JointVelocityController):
+ """
+ Controller for controlling robot arm via inverse kinematics. Allows position and orientation control of the
+ robot's end effector.
+
+ Inverse kinematics solving is handled by pybullet.
+
+ NOTE: Control input actions are assumed to be relative to the current position / orientation of the end effector
+ and are taken as the array (x_dpos, y_dpos, z_dpos, x_rot, y_rot, z_rot).
+
+ Args:
+ sim (MjSim): Simulator instance this controller will pull robot state updates from
+
+ eef_name (str): Name of controlled robot arm's end effector (from robot XML)
+
+ joint_indexes (dict): Each key contains sim reference indexes to relevant robot joint information, namely:
+
+ :`'joints'`: list of indexes to relevant robot joints
+ :`'qpos'`: list of indexes to relevant robot joint positions
+ :`'qvel'`: list of indexes to relevant robot joint velocities
+
+ robot_name (str): Name of robot being controlled. Can be {"Sawyer", "Panda", or "Baxter"}
+
+ actuator_range (2-tuple of array of float): 2-Tuple (low, high) representing the robot joint actuator range
+
+ eef_rot_offset (4-array): Quaternion (x,y,z,w) representing rotational offset between the final
+ robot arm link coordinate system and the end effector coordinate system (i.e: the gripper)
+
+ policy_freq (int): Frequency at which actions from the robot policy are fed into this controller
+
+ ik_pos_limit (float): Limit (meters) above which the magnitude of a given action's
+ positional inputs will be clipped
+
+ ik_ori_limit (float): Limit (radians) above which the magnitude of a given action's
+ orientation inputs will be clipped
+
+ interpolator (Interpolator): Interpolator object to be used for interpolating from the current state to
+ the goal state during each timestep between inputted actions
+
+ converge_steps (int): How many iterations to run the pybullet inverse kinematics solver to converge to a
+ solution
+
+ **kwargs: Does nothing; placeholder to "sink" any additional arguments so that instantiating this controller
+ via an argument dict that has additional extraneous arguments won't raise an error
+
+ Raises:
+ AssertionError: [Unsupported robot]
+ """
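+    # Control flow (summary): set_goal() clips the commanded delta and updates the reference targets,
+    # joint_positions_for_eef_command() runs pybullet IK on the resulting pose, a simple P law converts
+    # the IK joint positions into joint velocities, and the underlying JointVelocityController tracks
+    # those velocities with torques in run_controller().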
+
+ def __init__(
+ self,
+ sim,
+ eef_name,
+ joint_indexes,
+ robot_name,
+ actuator_range,
+ eef_rot_offset,
+ bullet_server_id=0,
+ policy_freq=20,
+ load_urdf=True,
+ ik_pos_limit=None,
+ ik_ori_limit=None,
+ interpolator_pos=None,
+ interpolator_ori=None,
+ converge_steps=5,
+ **kwargs,
+ ):
+
+        # Run superclass inits
+ super().__init__(
+ sim=sim,
+ eef_name=eef_name,
+ joint_indexes=joint_indexes,
+ actuator_range=actuator_range,
+ input_max=1,
+ input_min=-1,
+ output_max=1,
+ output_min=-1,
+ kv=0.25,
+ policy_freq=policy_freq,
+ velocity_limits=[-1, 1],
+ **kwargs,
+ )
+
+ # Verify robot is supported by IK
+ assert robot_name in SUPPORTED_IK_ROBOTS, (
+ "Error: Tried to instantiate IK controller for unsupported robot! "
+ "Inputted robot: {}, Supported robots: {}".format(robot_name, SUPPORTED_IK_ROBOTS)
+ )
+
+ # Initialize ik-specific attributes
+ self.robot_name = robot_name # Name of robot (e.g.: "Panda", "Sawyer", etc.)
+
+ # Override underlying control dim
+ self.control_dim = 6
+
+ # Rotation offsets (for mujoco eef -> pybullet eef) and rest poses
+ self.eef_rot_offset = eef_rot_offset
+ self.rotation_offset = None
+ self.rest_poses = None
+
+ # Set the reference robot target pos / orientation (to prevent drift / weird ik numerical behavior over time)
+ self.reference_target_pos = self.ee_pos
+ self.reference_target_orn = T.mat2quat(self.ee_ori_mat)
+
+ # Bullet server id
+ self.bullet_server_id = bullet_server_id
+
+ # Interpolator
+ self.interpolator_pos = interpolator_pos
+ self.interpolator_ori = interpolator_ori
+
+ # Interpolator-related attributes
+ self.ori_ref = None
+ self.relative_ori = None
+
+ # Values for initializing pybullet env
+ self.ik_robot = None
+ self.robot_urdf = None
+ self.num_bullet_joints = None
+ self.bullet_ee_idx = None
+ self.bullet_joint_indexes = None # Useful for splitting right and left hand indexes when controlling bimanual
+ self.ik_command_indexes = None # Relevant indices from ik loop; useful for splitting bimanual left / right
+ self.ik_robot_target_pos_offset = None
+ self.base_orn_offset_inv = None # inverse orientation offset from pybullet base to world
+ self.converge_steps = converge_steps
+
+ # Set ik limits and override internal min / max
+ self.ik_pos_limit = ik_pos_limit
+ self.ik_ori_limit = ik_ori_limit
+
+ # Target pos and ori
+ self.ik_robot_target_pos = None
+ self.ik_robot_target_orn = None # note: this currently isn't being used at all
+
+ # Commanded pos and resulting commanded vel
+ self.commanded_joint_positions = None
+ self.commanded_joint_velocities = None
+
+ # Should be in (0, 1], smaller values mean less sensitivity.
+ self.user_sensitivity = 0.3
+
+ # Setup inverse kinematics
+ self.setup_inverse_kinematics(load_urdf)
+
+ # Lastly, sync pybullet state to mujoco state
+ self.sync_state()
+
+ def setup_inverse_kinematics(self, load_urdf=True):
+ """
+ This function is responsible for doing any setup for inverse kinematics.
+
+ Inverse Kinematics maps end effector (EEF) poses to joint angles that are necessary to achieve those poses.
+
+ Args:
+ load_urdf (bool): specifies whether the robot urdf should be loaded into the sim. Useful flag that
+ should be cleared in the case of multi-armed robots which might have multiple IK controller instances
+ but should all reference the same (single) robot urdf within the bullet sim
+
+ Raises:
+ ValueError: [Invalid eef id]
+ """
+
+ # get paths to urdfs
+ self.robot_urdf = pjoin(
+ os.path.join(robosuite.models.assets_root, "bullet_data"),
+ "{}_description/urdf/{}_arm.urdf".format(self.robot_name.lower(), self.robot_name.lower()),
+ )
+
+ # import reference to the global pybullet server and load the urdfs
+ from robosuite.controllers import get_pybullet_server
+
+ if load_urdf:
+ self.ik_robot = p.loadURDF(fileName=self.robot_urdf, useFixedBase=1, physicsClientId=self.bullet_server_id)
+ # Add this to the pybullet server
+ get_pybullet_server().bodies[self.ik_robot] = self.robot_name
+ else:
+ # We'll simply assume the most recent robot (robot with highest pybullet id) is the relevant robot and
+ # mark this controller as belonging to that robot body
+ self.ik_robot = max(get_pybullet_server().bodies)
+
+ # load the number of joints from the bullet data
+ self.num_bullet_joints = p.getNumJoints(self.ik_robot, physicsClientId=self.bullet_server_id)
+
+ # Disable collisions between all the joints
+ for joint in range(self.num_bullet_joints):
+ p.setCollisionFilterGroupMask(
+ bodyUniqueId=self.ik_robot,
+ linkIndexA=joint,
+ collisionFilterGroup=0,
+ collisionFilterMask=0,
+ physicsClientId=self.bullet_server_id,
+ )
+
+ # TODO: Very ugly initialization - any way to automate this? Maybe move the hardcoded magic numbers to the robot model files?
+ # TODO: Rotations for non-default grippers are not all supported -- e.g.: Robotiq140 Gripper whose coordinate frame
+ # is fully flipped about its x axis -- resulting in mirrored rotational behavior when trying to execute IK control
+
+ # For now, hard code baxter bullet eef idx
+ if self.robot_name == "Baxter":
+ if "right" in self.eef_name:
+ self.bullet_ee_idx = 27
+ self.bullet_joint_indexes = [13, 14, 15, 16, 17, 19, 20]
+ self.ik_command_indexes = np.arange(1, self.joint_dim + 1)
+ elif "left" in self.eef_name:
+ self.bullet_ee_idx = 45
+ self.bullet_joint_indexes = [31, 32, 33, 34, 35, 37, 38]
+ self.ik_command_indexes = np.arange(self.joint_dim + 1, self.joint_dim * 2 + 1)
+ else:
+ # Error with inputted id
+ raise ValueError("Error loading ik controller for Baxter -- arm id's must contain 'right' or 'left'!")
+ else:
+ # Default assumes pybullet has same number of joints compared to mujoco sim
+ self.bullet_ee_idx = self.num_bullet_joints - 1
+ self.bullet_joint_indexes = np.arange(self.joint_dim)
+ self.ik_command_indexes = np.arange(self.joint_dim)
+
+ # Set rotation offsets (for mujoco eef -> pybullet eef) and rest poses
+ self.rest_poses = list(self.initial_joint)
+ eef_offset = np.eye(4)
+ eef_offset[:3, :3] = T.quat2mat(T.quat_inverse(self.eef_rot_offset))
+
+ self.rotation_offset = eef_offset
+
+ # Simulation will update as fast as it can in real time, instead of waiting for
+ # step commands like in the non-realtime case.
+ p.setRealTimeSimulation(1, physicsClientId=self.bullet_server_id)
+
+ def sync_state(self):
+ """
+ Syncs the internal Pybullet robot state to the joint positions of the
+ robot being controlled.
+ """
+
+ # update model (force update)
+ self.update(force=True)
+
+ # sync IK robot state to the current robot joint positions
+ self.sync_ik_robot()
+
+ # make sure target pose is up to date
+ self.ik_robot_target_pos, self.ik_robot_target_orn = self.ik_robot_eef_joint_cartesian_pose()
+
+ # Store initial offset for mapping pose between mujoco and pybullet (pose_pybullet = offset + pose_mujoco)
+ self.ik_robot_target_pos_offset = self.ik_robot_target_pos - self.ee_pos
+
+ def sync_ik_robot(self, joint_positions=None, simulate=False, sync_last=True):
+ """
+ Force the internal robot model to match the provided joint angles.
+
+ Args:
+ joint_positions (Iterable): Array of joint positions. Default automatically updates to
+ current mujoco joint pos state
+ simulate (bool): If True, actually use physics simulation, else
+ write to physics state directly.
+ sync_last (bool): If False, don't sync the last joint angle. This
+ is useful for directly controlling the roll at the end effector.
+ """
+ if not joint_positions:
+ joint_positions = self.joint_pos
+ num_joints = self.joint_dim
+ if not sync_last and self.robot_name != "Baxter":
+ num_joints -= 1
+ for i in range(num_joints):
+ if simulate:
+ p.setJointMotorControl2(
+ bodyUniqueId=self.ik_robot,
+ jointIndex=self.bullet_joint_indexes[i],
+ controlMode=p.POSITION_CONTROL,
+ targetVelocity=0,
+ targetPosition=joint_positions[i],
+ force=500,
+ positionGain=0.5,
+ velocityGain=1.0,
+ physicsClientId=self.bullet_server_id,
+ )
+ else:
+ p.resetJointState(
+ bodyUniqueId=self.ik_robot,
+ jointIndex=self.bullet_joint_indexes[i],
+ targetValue=joint_positions[i],
+ targetVelocity=0,
+ physicsClientId=self.bullet_server_id,
+ )
+
+ def ik_robot_eef_joint_cartesian_pose(self):
+ """
+ Calculates the current cartesian pose of the last joint of the ik robot with respect to the base frame as
+ a (pos, orn) tuple where orn is a x-y-z-w quaternion
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) position
+ - (np.array) orientation
+ """
+ eef_pos_in_world = np.array(
+ p.getLinkState(self.ik_robot, self.bullet_ee_idx, physicsClientId=self.bullet_server_id)[0]
+ )
+ eef_orn_in_world = np.array(
+ p.getLinkState(self.ik_robot, self.bullet_ee_idx, physicsClientId=self.bullet_server_id)[1]
+ )
+ eef_pose_in_world = T.pose2mat((eef_pos_in_world, eef_orn_in_world))
+
+ base_pos_in_world = np.array(
+ p.getBasePositionAndOrientation(self.ik_robot, physicsClientId=self.bullet_server_id)[0]
+ )
+ base_orn_in_world = np.array(
+ p.getBasePositionAndOrientation(self.ik_robot, physicsClientId=self.bullet_server_id)[1]
+ )
+ base_pose_in_world = T.pose2mat((base_pos_in_world, base_orn_in_world))
+ world_pose_in_base = T.pose_inv(base_pose_in_world)
+
+ # Update reference to inverse orientation offset from pybullet base frame to world frame
+ self.base_orn_offset_inv = T.quat2mat(T.quat_inverse(base_orn_in_world))
+
+ # Update reference target orientation
+ self.reference_target_orn = T.quat_multiply(self.reference_target_orn, base_orn_in_world)
+
+ eef_pose_in_base = T.pose_in_A_to_pose_in_B(pose_A=eef_pose_in_world, pose_A_in_B=world_pose_in_base)
+
+ return T.mat2pose(eef_pose_in_base)
+
+ def get_control(self, dpos=None, rotation=None, update_targets=False):
+ """
+ Returns joint velocities to control the robot after the target end effector
+ position and orientation are updated from arguments @dpos and @rotation.
+ If no arguments are provided, joint velocities will be computed based
+ on the previously recorded target.
+
+ Args:
+ dpos (np.array): a 3 dimensional array corresponding to the desired
+ change in x, y, and z end effector position.
+ rotation (np.array): a rotation matrix of shape (3, 3) corresponding
+ to the desired rotation from the current orientation of the end effector.
+ update_targets (bool): whether to update ik target pos / ori attributes or not
+
+ Returns:
+ np.array: a flat array of joint velocity commands to apply to try and achieve the desired input control.
+ """
+ # Sync joint positions for IK.
+ self.sync_ik_robot()
+
+ # Compute new target joint positions if arguments are provided
+ if (dpos is not None) and (rotation is not None):
+ self.commanded_joint_positions = np.array(
+ self.joint_positions_for_eef_command(dpos, rotation, update_targets)
+ )
+
+ # P controller from joint positions (from IK) to velocities
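+        # (i.e. qdot_i = -k * (q_i - q_i_commanded), with the fixed gain k = 10 used below)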
+ velocities = np.zeros(self.joint_dim)
+ deltas = self._get_current_error(self.joint_pos, self.commanded_joint_positions)
+ for i, delta in enumerate(deltas):
+ velocities[i] = -10.0 * delta
+
+ self.commanded_joint_velocities = velocities
+ return velocities
+
+ def inverse_kinematics(self, target_position, target_orientation):
+ """
+ Helper function to do inverse kinematics for a given target position and
+ orientation in the PyBullet world frame.
+
+ Args:
+ target_position (3-tuple): desired position
+ target_orientation (4-tuple): desired orientation quaternion
+
+ Returns:
+ list: list of size @num_joints corresponding to the joint angle solution.
+ """
+ ik_solution = list(
+ p.calculateInverseKinematics(
+ bodyUniqueId=self.ik_robot,
+ endEffectorLinkIndex=self.bullet_ee_idx,
+ targetPosition=target_position,
+ targetOrientation=target_orientation,
+ lowerLimits=list(self.sim.model.jnt_range[self.joint_index, 0]),
+ upperLimits=list(self.sim.model.jnt_range[self.joint_index, 1]),
+ jointRanges=list(
+ self.sim.model.jnt_range[self.joint_index, 1] - self.sim.model.jnt_range[self.joint_index, 0]
+ ),
+ restPoses=self.rest_poses,
+ jointDamping=[0.1] * self.num_bullet_joints,
+ physicsClientId=self.bullet_server_id,
+ )
+ )
+ return list(np.array(ik_solution)[self.ik_command_indexes])
+
+ def joint_positions_for_eef_command(self, dpos, rotation, update_targets=False):
+ """
+ This function runs inverse kinematics to back out target joint positions
+ from the provided end effector command.
+
+ Args:
+ dpos (np.array): a 3 dimensional array corresponding to the desired
+ change in x, y, and z end effector position.
+ rotation (np.array): a rotation matrix of shape (3, 3) corresponding
+ to the desired rotation from the current orientation of the end effector.
+ update_targets (bool): whether to update ik target pos / ori attributes or not
+
+ Returns:
+ list: A list of size @num_joints corresponding to the target joint angles.
+ """
+
+ # Calculate the rotation
+ # This equals: inv base offset * eef * offset accounting for deviation between mujoco eef and pybullet eef
+ rotation = self.base_orn_offset_inv @ self.ee_ori_mat @ rotation @ self.rotation_offset[:3, :3]
+
+ # Determine targets based on whether we're using interpolator(s) or not
+ if self.interpolator_pos or self.interpolator_ori:
+ targets = (self.ee_pos + dpos + self.ik_robot_target_pos_offset, T.mat2quat(rotation))
+ else:
+ targets = (self.ik_robot_target_pos + dpos, T.mat2quat(rotation))
+
+ # convert from target pose in base frame to target pose in bullet world frame
+ world_targets = self.bullet_base_pose_to_world_pose(targets)
+
+ # Update targets if required
+ if update_targets:
+ # Scale and increment target position
+ self.ik_robot_target_pos += dpos
+
+ # Convert the desired rotation into the target orientation quaternion
+ self.ik_robot_target_orn = T.mat2quat(rotation)
+
+ # Converge to IK solution
+ arm_joint_pos = None
+ for bullet_i in range(self.converge_steps):
+ arm_joint_pos = self.inverse_kinematics(world_targets[0], world_targets[1])
+ self.sync_ik_robot(arm_joint_pos, sync_last=True)
+
+ return arm_joint_pos
+
+ def bullet_base_pose_to_world_pose(self, pose_in_base):
+ """
+ Convert a pose in the base frame to a pose in the world frame.
+
+ Args:
+ pose_in_base (2-tuple): a (pos, orn) tuple.
+
+ Returns:
+ 2-tuple: a (pos, orn) tuple reflecting robot pose in world coordinates
+ """
+ pose_in_base = T.pose2mat(pose_in_base)
+
+ base_pos_in_world, base_orn_in_world = p.getBasePositionAndOrientation(
+ self.ik_robot, physicsClientId=self.bullet_server_id
+ )
+ base_pos_in_world, base_orn_in_world = np.array(base_pos_in_world), np.array(base_orn_in_world)
+
+ base_pose_in_world = T.pose2mat((base_pos_in_world, base_orn_in_world))
+
+ pose_in_world = T.pose_in_A_to_pose_in_B(pose_A=pose_in_base, pose_A_in_B=base_pose_in_world)
+ return T.mat2pose(pose_in_world)
+
+ def set_goal(self, delta, set_ik=None):
+ """
+ Sets the internal goal state of this controller based on @delta
+
+ Note that this controller wraps a VelocityController, and so determines the desired velocities
+ to achieve the inputted pose, and sets its internal setpoint in terms of joint velocities
+
+ TODO: Add feature so that using @set_ik automatically sets the target values to these absolute values
+
+ Args:
+ delta (Iterable): Desired relative position / orientation goal state
+ set_ik (Iterable): If set, overrides @delta and sets the desired global position / orientation goal state
+ """
+ # Update state
+ self.update()
+
+ # Get requested delta inputs if we're using interpolators
+ (dpos, dquat) = self._clip_ik_input(delta[:3], delta[3:7])
+
+ # Set interpolated goals if necessary
+ if self.interpolator_pos is not None:
+ # Absolute position goal
+ self.interpolator_pos.set_goal(dpos * self.user_sensitivity + self.reference_target_pos)
+
+ if self.interpolator_ori is not None:
+ # Relative orientation goal
+ self.interpolator_ori.set_goal(dquat) # goal is the relative change in orientation
+ self.ori_ref = np.array(self.ee_ori_mat) # reference is the current orientation at start
+ self.relative_ori = np.zeros(3) # relative orientation always starts at 0
+
+        # Run ik preprocessing to convert pos, quat ori to desired velocities
+ requested_control = self._make_input(delta, self.reference_target_orn)
+
+ # Compute desired velocities to achieve eef pos / ori
+ velocities = self.get_control(**requested_control, update_targets=True)
+
+ # Set the goal velocities for the underlying velocity controller
+ super().set_goal(velocities)
+
+ def run_controller(self):
+ """
+ Calculates the torques required to reach the desired setpoint
+
+ Returns:
+ np.array: Command torques
+ """
+ # Update state
+ self.update()
+
+ # Update interpolated action if necessary
+ desired_pos = None
+ rotation = None
+ update_velocity_goal = False
+
+ # Update interpolated goals if active
+ if self.interpolator_pos is not None:
+ # Linear case
+ if self.interpolator_pos.order == 1:
+ desired_pos = self.interpolator_pos.get_interpolated_goal()
+ else:
+ # Nonlinear case not currently supported
+ pass
+ update_velocity_goal = True
+ else:
+ desired_pos = self.reference_target_pos
+
+ if self.interpolator_ori is not None:
+ # Linear case
+ if self.interpolator_ori.order == 1:
+ # relative orientation based on difference between current ori and ref
+ self.relative_ori = orientation_error(self.ee_ori_mat, self.ori_ref)
+ ori_error = self.interpolator_ori.get_interpolated_goal()
+ rotation = T.quat2mat(ori_error)
+ else:
+ # Nonlinear case not currently supported
+ pass
+ update_velocity_goal = True
+ else:
+ rotation = T.quat2mat(self.reference_target_orn)
+
+ # Only update the velocity goals if we're interpolating
+ if update_velocity_goal:
+ velocities = self.get_control(dpos=(desired_pos - self.ee_pos), rotation=rotation)
+ super().set_goal(velocities)
+
+ # Run controller with given action
+ return super().run_controller()
+
+ def update_base_pose(self, base_pos, base_ori):
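+        """
+        Updates the pybullet robot's base pose and re-syncs the internal IK state.
+
+        Args:
+            base_pos (3-array): new base position, in world frame
+            base_ori (4-array): new base orientation, as an (x, y, z, w) quaternion
+        """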
+ # Update pybullet robot base and orientation according to values
+ p.resetBasePositionAndOrientation(
+ bodyUniqueId=self.ik_robot, posObj=base_pos, ornObj=base_ori, physicsClientId=self.bullet_server_id
+ )
+
+ # Re-sync pybullet state
+ self.sync_state()
+
+ def update_initial_joints(self, initial_joints):
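+        """
+        Updates the internal initial joint configuration and refreshes the IK rest poses accordingly.
+
+        Args:
+            initial_joints (Iterable): new default joint positions to use as the IK solver's rest pose
+        """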
+ # First, update from the superclass method
+ super().update_initial_joints(initial_joints)
+
+ # Then, update the rest pose from the initial joints
+ self.rest_poses = list(self.initial_joint)
+
+ def reset_goal(self):
+ """
+ Resets the goal to the current pose of the robot
+ """
+ self.reference_target_pos = self.ee_pos
+ self.reference_target_orn = T.mat2quat(self.ee_ori_mat)
+
+ # Sync pybullet state as well
+ self.sync_state()
+
+ def _clip_ik_input(self, dpos, rotation):
+ """
+ Helper function that clips desired ik input deltas into a valid range.
+
+ Args:
+ dpos (np.array): a 3 dimensional array corresponding to the desired
+ change in x, y, and z end effector position.
+ rotation (np.array): relative rotation in scaled axis angle form (ax, ay, az)
+ corresponding to the (relative) desired orientation of the end effector.
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) clipped dpos
+ - (np.array) clipped rotation
+ """
+ # scale input range to desired magnitude
+ if dpos.any():
+ dpos, _ = T.clip_translation(dpos, self.ik_pos_limit)
+
+ # Map input to quaternion
+ rotation = T.axisangle2quat(rotation)
+
+ # Clip orientation to desired magnitude
+ rotation, _ = T.clip_rotation(rotation, self.ik_ori_limit)
+
+ return dpos, rotation
+
+ def _make_input(self, action, old_quat):
+ """
+ Helper function that returns a dictionary with keys dpos, rotation from a raw input
+ array. The first three elements are taken to be displacement in position, and a
+ quaternion indicating the change in rotation with respect to @old_quat. Additionally clips @action as well
+
+ Args:
+ action (np.array) should have form: [dx, dy, dz, ax, ay, az] (orientation in
+ scaled axis-angle form)
+ old_quat (np.array) the old target quaternion that will be updated with the relative change in @action
+ """
+ # Clip action appropriately
+ dpos, rotation = self._clip_ik_input(action[:3], action[3:])
+
+ # Update reference targets
+ self.reference_target_pos += dpos * self.user_sensitivity
+ self.reference_target_orn = T.quat_multiply(old_quat, rotation)
+
+ return {"dpos": dpos * self.user_sensitivity, "rotation": T.quat2mat(rotation)}
+
+ @staticmethod
+ def _get_current_error(current, set_point):
+ """
+ Returns an array of differences between the desired joint positions and current
+ joint positions. Useful for PID control.
+
+ Args:
+ current (np.array): the current joint positions
+ set_point (np.array): the joint positions that are desired as a numpy array
+
+ Returns:
+ np.array: the current error in the joint positions
+ """
+ error = current - set_point
+ return error
+
+ @property
+ def control_limits(self):
+ """
+ The limits over this controller's action space, as specified by self.ik_pos_limit and self.ik_ori_limit
+ and overriding the superclass method
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum control values
+ - (np.array) maximum control values
+ """
+ max_limit = np.concatenate([self.ik_pos_limit * np.ones(3), self.ik_ori_limit * np.ones(3)])
+ return -max_limit, max_limit
+
+ @property
+ def name(self):
+ return "IK_POSE"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/interpolators/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/interpolators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/interpolators/base_interpolator.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/interpolators/base_interpolator.py
new file mode 100644
index 0000000000000000000000000000000000000000..a09c879b8650d1515d69f54c91093f8ae829c6f8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/interpolators/base_interpolator.py
@@ -0,0 +1,17 @@
+import abc
+
+
+class Interpolator(object, metaclass=abc.ABCMeta):
+ """
+ General interpolator interface.
+ """
+
+ @abc.abstractmethod
+ def get_interpolated_goal(self):
+ """
+ Provides the next step in interpolation given the remaining steps.
+
+ Returns:
+ np.array: Next interpolated step
+ """
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/interpolators/linear_interpolator.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/interpolators/linear_interpolator.py
new file mode 100644
index 0000000000000000000000000000000000000000..36a3aa49690c0d369c7d349e7492040ecdf3d76c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/interpolators/linear_interpolator.py
@@ -0,0 +1,137 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.controllers.interpolators.base_interpolator import Interpolator
+
+
+class LinearInterpolator(Interpolator):
+ """
+ Simple class for implementing a linear interpolator.
+
+ Abstracted to interpolate n-dimensions
+
+ Args:
+ ndim (int): Number of dimensions to interpolate
+
+ controller_freq (float): Frequency (Hz) of the controller
+
+ policy_freq (float): Frequency (Hz) of the policy model
+
+ ramp_ratio (float): Percentage of interpolation timesteps across which we will interpolate to a goal position.
+
+        :Note: Num total interpolation steps will be equal to np.ceil(ramp_ratio * controller_freq / policy_freq)
+ i.e.: how many controller steps we get per action space update
+
+ ori_interpolate (None or str): If set, assumes that we are interpolating angles (orientation)
+ Specified string determines assumed type of input:
+
+ `'euler'`: Euler orientation inputs
+ `'quat'`: Quaternion inputs
+ """
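+    # Illustrative usage (a sketch only; the frequencies and goal below are made-up values):
+    #   interp = LinearInterpolator(ndim=3, controller_freq=500, policy_freq=20, ramp_ratio=0.2)
+    #   interp.set_goal(np.array([0.1, 0.0, 0.0]))  # absolute goal; shape must match ndim
+    #   step = interp.get_interpolated_goal()       # one of ceil(0.2 * 500 / 20) = 5 steps toward the goal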
+
+ def __init__(
+ self,
+ ndim,
+ controller_freq,
+ policy_freq,
+ ramp_ratio=0.2,
+ use_delta_goal=False,
+ ori_interpolate=None,
+ ):
+ self.dim = ndim # Number of dimensions to interpolate
+ self.ori_interpolate = ori_interpolate # Whether this is interpolating orientation or not
+ self.order = 1 # Order of the interpolator (1 = linear)
+ self.step = 0 # Current step of the interpolator
+ self.total_steps = np.ceil(
+ ramp_ratio * controller_freq / policy_freq
+ ) # Total num steps per interpolator action
+ self.use_delta_goal = use_delta_goal # Whether to use delta or absolute goals (currently
+ # not implemented yet- TODO)
+ self.set_states(dim=ndim, ori=ori_interpolate)
+
+ def set_states(self, dim=None, ori=None):
+ """
+ Updates self.dim and self.ori_interpolate.
+
+ Initializes self.start and self.goal with correct dimensions.
+
+ Args:
+            dim (None or int): Number of dimensions to interpolate
+
+            ori (None or str): If set, assumes that we are interpolating angles (orientation)
+                Specified string determines assumed type of input:
+
+                `'euler'`: Euler orientation inputs
+                `'quat'`: Quaternion inputs
+ """
+ # Update self.dim and self.ori_interpolate
+ self.dim = dim if dim is not None else self.dim
+ self.ori_interpolate = ori if ori is not None else self.ori_interpolate
+
+ # Set start and goal states
+ if self.ori_interpolate is not None:
+ if self.ori_interpolate == "euler":
+ self.start = np.zeros(3)
+ else: # quaternions
+ self.start = np.array((0, 0, 0, 1))
+ else:
+ self.start = np.zeros(self.dim)
+ self.goal = np.array(self.start)
+
+ def set_goal(self, goal):
+ """
+ Takes a requested (absolute) goal and updates internal parameters for next interpolation step
+
+ Args:
+            goal (np.array): Requested goal (absolute value). Should be same dimension as self.dim
+ """
+ # First, check to make sure requested goal shape is the same as self.dim
+ if goal.shape[0] != self.dim:
+ print("Requested goal: {}".format(goal))
+ raise ValueError(
+ "LinearInterpolator: Input size wrong for goal; got {}, needs to be {}!".format(goal.shape[0], self.dim)
+ )
+
+ # Update start and goal
+ self.start = np.array(self.goal)
+ self.goal = np.array(goal)
+
+ # Reset interpolation steps
+ self.step = 0
+
+ def get_interpolated_goal(self):
+ """
+ Provides the next step in interpolation given the remaining steps.
+
+ NOTE: If this interpolator is for orientation, it is assumed to be receiving either euler angles or quaternions
+
+ Returns:
+ np.array: Next position in the interpolated trajectory
+ """
+ # Grab start position
+ x = np.array(self.start)
+ # Calculate the desired next step based on remaining interpolation steps
+ if self.ori_interpolate is not None:
+ # This is an orientation interpolation, so we interpolate linearly around a sphere instead
+ goal = np.array(self.goal)
+ if self.ori_interpolate == "euler":
+ # this is assumed to be euler angles (x,y,z), so we need to first map to quat
+ x = T.mat2quat(T.euler2mat(x))
+ goal = T.mat2quat(T.euler2mat(self.goal))
+
+ # Interpolate to the next sequence
+ x_current = T.quat_slerp(x, goal, fraction=(self.step + 1) / self.total_steps)
+ if self.ori_interpolate == "euler":
+ # Map back to euler
+ x_current = T.mat2euler(T.quat2mat(x_current))
+ else:
+ # This is a normal interpolation
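+            # Each call moves from the fixed start point a fraction 1 / (total_steps - step) of the
+            # way to the goal, so the goal is reached exactly on the final interpolation step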
+ dx = (self.goal - x) / (self.total_steps - self.step)
+ x_current = x + dx
+
+ # Increment step if there's still steps remaining based on ramp ratio
+ if self.step < self.total_steps - 1:
+ self.step += 1
+
+ # Return the new interpolated step
+ return x_current
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_pos.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_pos.py
new file mode 100644
index 0000000000000000000000000000000000000000..5604ae37c318a7c46a5866750f43df768e58e27f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_pos.py
@@ -0,0 +1,304 @@
+from typing import Dict, List, Literal
+import numpy as np
+
+from robosuite.controllers.base_controller import Controller
+from robosuite.utils.control_utils import *
+
+# Supported impedance modes
+IMPEDANCE_MODES = {"fixed", "variable", "variable_kp"}
+
+
+class JointPositionController(Controller):
+ """
+ Controller for controlling robot arm via impedance control. Allows position control of the robot's joints.
+
+ NOTE: Control input actions assumed to be taken relative to the current joint positions. A given action to this
+ controller is assumed to be of the form: (dpos_j0, dpos_j1, ... , dpos_jn-1) for an n-joint robot
+
+ Args:
+ sim (MjSim): Simulator instance this controller will pull robot state updates from
+
+ eef_name (str): Name of controlled robot arm's end effector (from robot XML)
+
+ joint_indexes (dict): Each key contains sim reference indexes to relevant robot joint information, namely:
+
+ :`'joints'`: list of indexes to relevant robot joints
+ :`'qpos'`: list of indexes to relevant robot joint positions
+ :`'qvel'`: list of indexes to relevant robot joint velocities
+
+ actuator_range (2-tuple of array of float): 2-Tuple (low, high) representing the robot joint actuator range
+
+        input_max (float or Iterable of float): Maximum above which an inputted action will be clipped. Can either be
+            a scalar (same value for all action dimensions), or a list (specific values for each dimension). If the
+            latter, dimension should be the same as the control dimension for this controller
+
+        input_min (float or Iterable of float): Minimum below which an inputted action will be clipped. Can either be
+            a scalar (same value for all action dimensions), or a list (specific values for each dimension). If the
+            latter, dimension should be the same as the control dimension for this controller
+
+        output_max (float or Iterable of float): Maximum which defines upper end of scaling range when scaling an input
+            action. Can either be a scalar (same value for all action dimensions), or a list (specific values for
+            each dimension). If the latter, dimension should be the same as the control dimension for this controller
+
+        output_min (float or Iterable of float): Minimum which defines lower end of scaling range when scaling an input
+            action. Can either be a scalar (same value for all action dimensions), or a list (specific values for
+            each dimension). If the latter, dimension should be the same as the control dimension for this controller
+
+        kp (float or Iterable of float): positional gain for determining desired torques based upon the joint pos error.
+            Can either be a scalar (same value for all action dims), or a list (specific values for each dim)
+
+        damping_ratio (float or Iterable of float): used in conjunction with kp to determine the velocity gain for
+            determining desired torques based upon the joint pos errors. Can either be a scalar (same value for all
+            action dims), or a list (specific values for each dim)
+
+ impedance_mode (str): Impedance mode with which to run this controller. Options are {"fixed", "variable",
+ "variable_kp"}. If "fixed", the controller will have fixed kp and damping_ratio values as specified by the
+ @kp and @damping_ratio arguments. If "variable", both kp and damping_ratio will now be part of the
+ controller action space, resulting in a total action space of num_joints * 3. If "variable_kp", only kp
+ will become variable, with damping_ratio fixed at 1 (critically damped). The resulting action space will
+ then be num_joints * 2.
+
+        kp_limits (2-list of float or 2-list of Iterable of floats): Only applicable if @impedance_mode is set to either
+            "variable" or "variable_kp". This sets the corresponding min / max ranges of the controller action space
+            for the varying kp values. Can either be a 2-list (same min / max for all kp action dims), or a 2-list
+            of list (specific min / max for each kp dim)
+
+        damping_ratio_limits (2-list of float or 2-list of Iterable of floats): Only applicable if @impedance_mode is
+            set to "variable". This sets the corresponding min / max ranges of the controller action space for the
+            varying damping_ratio values. Can either be a 2-list (same min / max for all damping_ratio action dims),
+            or a 2-list of list (specific min / max for each damping_ratio dim)
+
+        policy_freq (int): Frequency at which actions from the robot policy are fed into this controller
+
+        qpos_limits (2-list of float or 2-list of Iterable of floats): Limits (rad) below and above which the magnitude
+            of a calculated goal joint position will be clipped. Can either be a 2-list (same min/max value for all
+            joint dims), or a 2-list of list (specific min/max values for each dim)
+
+ interpolator (Interpolator): Interpolator object to be used for interpolating from the current joint position to
+ the goal joint position during each timestep between inputted actions
+
+ **kwargs: Does nothing; placeholder to "sink" any additional arguments so that instantiating this controller
+ via an argument dict that has additional extraneous arguments won't raise an error
+
+ Raises:
+ AssertionError: [Invalid impedance mode]
+ """
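+    # Illustrative delta command (a sketch; assumes the default symmetric input / output ranges):
+    # with input_type="delta", passing np.zeros(n_joints) to set_goal() scales to a zero offset
+    # and therefore holds the current joint configuration as the goal.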
+
+ def __init__(
+ self,
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ input_max=1,
+ input_min=-1,
+ output_max=0.05,
+ output_min=-0.05,
+ kp=50,
+ damping_ratio=1,
+ impedance_mode="fixed",
+ kp_limits=(0, 300),
+ damping_ratio_limits=(0, 100),
+ policy_freq=20,
+ qpos_limits=None,
+ interpolator=None,
+ input_type: Literal["delta", "absolute"] = "delta",
+ **kwargs, # does nothing; used so no error raised when dict is passed with extra terms used previously
+ ):
+
+ super().__init__(
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ )
+
+ # Control dimension
+ self.control_dim = len(joint_indexes["joints"])
+
+ # input and output max and min (allow for either explicit lists or single numbers)
+ self.input_max = self.nums2array(input_max, self.control_dim)
+ self.input_min = self.nums2array(input_min, self.control_dim)
+ self.output_max = self.nums2array(output_max, self.control_dim)
+ self.output_min = self.nums2array(output_min, self.control_dim)
+
+ # limits
+ self.position_limits = np.array(qpos_limits) if qpos_limits is not None else qpos_limits
+
+ # kp kd
+ self.kp = self.nums2array(kp, self.control_dim)
+ self.kd = 2 * np.sqrt(self.kp) * damping_ratio
+
+ # kp and kd limits
+ self.kp_min = self.nums2array(kp_limits[0], self.control_dim)
+ self.kp_max = self.nums2array(kp_limits[1], self.control_dim)
+ self.damping_ratio_min = self.nums2array(damping_ratio_limits[0], self.control_dim)
+ self.damping_ratio_max = self.nums2array(damping_ratio_limits[1], self.control_dim)
+
+ # Verify the proposed impedance mode is supported
+ assert impedance_mode in IMPEDANCE_MODES, (
+            "Error: Tried to instantiate JOINT_POSITION controller for unsupported "
+ "impedance mode! Inputted impedance mode: {}, Supported modes: {}".format(impedance_mode, IMPEDANCE_MODES)
+ )
+
+ # Impedance mode
+ self.impedance_mode = impedance_mode
+
+ # Add to control dim based on impedance_mode
+ if self.impedance_mode == "variable":
+ self.control_dim *= 3
+ elif self.impedance_mode == "variable_kp":
+ self.control_dim *= 2
+
+ # control frequency
+ self.control_freq = policy_freq
+
+ # interpolator
+ self.interpolator = interpolator
+
+ self.input_type = input_type
+ print(f"Input type: {self.input_type}")
+ assert self.input_type in ["delta", "absolute"], f"Input type must be delta or absolute, got: {self.input_type}"
+ if self.input_type == "absolute":
+ assert self.impedance_mode == "fixed", "Absolute input type is only supported for fixed impedance mode."
+
+
+ # initialize
+ self.goal_qpos = None
+
+ def set_goal(self, action, set_qpos=None):
+ """
+ Sets goal based on input @action. If self.impedance_mode is not "fixed", then the input will be parsed into the
+ delta values to update the goal position / pose and the kp and/or damping_ratio values to be immediately updated
+        internally before executing the subsequent control loop.
+
+ Note that @action expected to be in the following format, based on impedance mode!
+
+ :Mode `'fixed'`: [joint pos command]
+ :Mode `'variable'`: [damping_ratio values, kp values, joint pos command]
+ :Mode `'variable_kp'`: [kp values, joint pos command]
+
+ Args:
+ action (Iterable): Desired relative joint position goal state
+ set_qpos (Iterable): If set, overrides @action and sets the desired absolute joint position goal state
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ # Update state
+ self.update()
+
+ if self.input_type == "delta":
+
+ # Parse action based on the impedance mode, and update kp / kd as necessary
+ jnt_dim = len(self.qpos_index)
+ if self.impedance_mode == "variable":
+ damping_ratio, kp, delta = action[:jnt_dim], action[jnt_dim : 2 * jnt_dim], action[2 * jnt_dim :]
+ self.kp = np.clip(kp, self.kp_min, self.kp_max)
+ self.kd = 2 * np.sqrt(self.kp) * np.clip(damping_ratio, self.damping_ratio_min, self.damping_ratio_max)
+ elif self.impedance_mode == "variable_kp":
+ kp, delta = action[:jnt_dim], action[jnt_dim:]
+ self.kp = np.clip(kp, self.kp_min, self.kp_max)
+ self.kd = 2 * np.sqrt(self.kp) # critically damped
+ else: # This is case "fixed"
+ delta = action
+
+ # Check to make sure delta is size self.joint_dim
+ assert len(delta) == jnt_dim, "Delta qpos must be equal to the robot's joint dimension space!"
+
+ if delta is not None:
+ scaled_delta = self.scale_action(delta)
+ else:
+ scaled_delta = None
+
+ self.goal_qpos = set_goal_position(
+ scaled_delta, self.joint_pos, position_limit=self.position_limits, set_pos=set_qpos
+ )
+ elif self.input_type == "absolute":
+ self.goal_qpos = action
+
+ if self.interpolator is not None:
+ self.interpolator.set_goal(self.goal_qpos)
+
+ def run_controller(self):
+ """
+ Calculates the torques required to reach the desired setpoint
+
+ Returns:
+ np.array: Command torques
+ """
+ # Make sure goal has been set
+ if self.goal_qpos is None:
+ self.set_goal(np.zeros(self.control_dim))
+
+ # Update state
+ self.update()
+
+ desired_qpos = None
+
+ # Only linear interpolator is currently supported
+ if self.interpolator is not None:
+ # Linear case
+ if self.interpolator.order == 1:
+ desired_qpos = self.interpolator.get_interpolated_goal()
+ else:
+ # Nonlinear case not currently supported
+ pass
+ else:
+ desired_qpos = np.array(self.goal_qpos)
+
+ # torques = pos_err * kp + vel_err * kd
+ position_error = desired_qpos - self.joint_pos
+ vel_pos_error = -self.joint_vel
+ desired_torque = np.multiply(np.array(position_error), np.array(self.kp)) + np.multiply(vel_pos_error, self.kd)
+
+        # Map the desired PD term through the mass matrix and add gravity compensation torques
+ self.torques = np.dot(self.mass_matrix, desired_torque) + self.torque_compensation
+
+ # Always run superclass call for any cleanups at the end
+ super().run_controller()
+
+ # print(f"current qpos: {self.joint_pos}")
+ # print(f"desired qpos: {desired_qpos}")
+
+ return self.torques
+
+ def reset_goal(self):
+ """
+ Resets joint position goal to be current position
+ """
+ self.goal_qpos = self.joint_pos
+
+ # Reset interpolator if required
+ if self.interpolator is not None:
+ self.interpolator.set_goal(self.goal_qpos)
+
+ @property
+ def control_limits(self):
+ """
+ Returns the limits over this controller's action space, overrides the superclass property
+ Returns the following (generalized for both high and low limits), based on the impedance mode:
+
+ :Mode `'fixed'`: [joint pos command]
+ :Mode `'variable'`: [damping_ratio values, kp values, joint pos command]
+ :Mode `'variable_kp'`: [kp values, joint pos command]
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum action values
+ - (np.array) maximum action values
+ """
+ if self.impedance_mode == "variable":
+ low = np.concatenate([self.damping_ratio_min, self.kp_min, self.input_min])
+ high = np.concatenate([self.damping_ratio_max, self.kp_max, self.input_max])
+ elif self.impedance_mode == "variable_kp":
+ low = np.concatenate([self.kp_min, self.input_min])
+ high = np.concatenate([self.kp_max, self.input_max])
+ else: # This is case "fixed"
+ low, high = self.input_min, self.input_max
+ return low, high
+
+ @property
+ def name(self):
+ return "JOINT_POSITION"
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_tor.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_tor.py
new file mode 100644
index 0000000000000000000000000000000000000000..643c43b5622965ef7e150bd8f3f1dd76d6325c9c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_tor.py
@@ -0,0 +1,172 @@
+import numpy as np
+
+from robosuite.controllers.base_controller import Controller
+
+
+class JointTorqueController(Controller):
+ """
+ Controller for controlling the robot arm's joint torques. As the actuators at the mujoco sim level are already
+ torque actuators, this "controller" usually simply "passes through" desired torques, though it also includes the
+ typical input / output scaling and clipping, as well as interpolator features seen in other controllers classes
+ as well
+
+ NOTE: Control input actions assumed to be taken as absolute joint torques. A given action to this
+ controller is assumed to be of the form: (torq_j0, torq_j1, ... , torq_jn-1) for an n-joint robot
+
+ Args:
+ sim (MjSim): Simulator instance this controller will pull robot state updates from
+
+ eef_name (str): Name of controlled robot arm's end effector (from robot XML)
+
+ joint_indexes (dict): Each key contains sim reference indexes to relevant robot joint information, namely:
+
+ :`'joints'`: list of indexes to relevant robot joints
+ :`'qpos'`: list of indexes to relevant robot joint positions
+ :`'qvel'`: list of indexes to relevant robot joint velocities
+
+ actuator_range (2-tuple of array of float): 2-Tuple (low, high) representing the robot joint actuator range
+
+        input_max (float or list of float): Maximum above which an inputted action will be clipped. Can either be
+            a scalar (same value for all action dimensions), or a list (specific values for each dimension). If the
+            latter, dimension should be the same as the control dimension for this controller
+
+        input_min (float or list of float): Minimum below which an inputted action will be clipped. Can either be
+            a scalar (same value for all action dimensions), or a list (specific values for each dimension). If the
+            latter, dimension should be the same as the control dimension for this controller
+
+        output_max (float or list of float): Maximum which defines upper end of scaling range when scaling an input
+            action. Can either be a scalar (same value for all action dimensions), or a list (specific values for
+            each dimension). If the latter, dimension should be the same as the control dimension for this controller
+
+        output_min (float or list of float): Minimum which defines lower end of scaling range when scaling an input
+            action. Can either be a scalar (same value for all action dimensions), or a list (specific values for
+            each dimension). If the latter, dimension should be the same as the control dimension for this controller
+
+        policy_freq (int): Frequency at which actions from the robot policy are fed into this controller
+
+        torque_limits (2-list of float or 2-list of list of floats): Limits (N-m) below and above which the magnitude
+            of a calculated goal joint torque will be clipped. Can either be a 2-list (same min/max value for all
+            joint dims), or a 2-list of list (specific min/max values for each dim)
+            If not specified, will automatically set the limits to the actuator limits for this robot arm
+
+ interpolator (Interpolator): Interpolator object to be used for interpolating from the current joint torques to
+ the goal joint torques during each timestep between inputted actions
+
+ **kwargs: Does nothing; placeholder to "sink" any additional arguments so that instantiating this controller
+ via an argument dict that has additional extraneous arguments won't raise an error
+ """
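+    # Illustrative command (a sketch; assumes the default ranges below): with input range [-1, 1]
+    # and output range [-0.05, 0.05], set_goal(np.ones(n_joints)) requests +0.05 N-m on every joint
+    # (before gravity compensation), subject to self.torque_limits.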
+
+ def __init__(
+ self,
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ input_max=1,
+ input_min=-1,
+ output_max=0.05,
+ output_min=-0.05,
+ policy_freq=20,
+ torque_limits=None,
+ interpolator=None,
+ **kwargs, # does nothing; used so no error raised when dict is passed with extra terms used previously
+ ):
+
+ super().__init__(
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ )
+
+ # Control dimension
+ self.control_dim = len(joint_indexes["joints"])
+
+ # input and output max and min (allow for either explicit lists or single numbers)
+ self.input_max = self.nums2array(input_max, self.control_dim)
+ self.input_min = self.nums2array(input_min, self.control_dim)
+ self.output_max = self.nums2array(output_max, self.control_dim)
+ self.output_min = self.nums2array(output_min, self.control_dim)
+
+ # limits (if not specified, set them to actuator limits by default)
+ self.torque_limits = np.array(torque_limits) if torque_limits is not None else self.actuator_limits
+
+ # control frequency
+ self.control_freq = policy_freq
+
+ # interpolator
+ self.interpolator = interpolator
+
+ # initialize torques
+ self.goal_torque = None # Goal torque desired, pre-compensation
+ self.current_torque = np.zeros(self.control_dim) # Current torques being outputted, pre-compensation
+ self.torques = None # Torques returned every time run_controller is called
+
+ def set_goal(self, torques):
+ """
+ Sets goal based on input @torques.
+
+ Args:
+ torques (Iterable): Desired joint torques
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ # Update state
+ self.update()
+
+ # Check to make sure torques is size self.joint_dim
+        assert len(torques) == self.control_dim, "Torque command must be equal to the robot's joint dimension space!"
+
+ self.goal_torque = np.clip(self.scale_action(torques), self.torque_limits[0], self.torque_limits[1])
+
+ if self.interpolator is not None:
+ self.interpolator.set_goal(self.goal_torque)
+
+ def run_controller(self):
+ """
+ Calculates the torques required to reach the desired setpoint
+
+ Returns:
+ np.array: Command torques
+ """
+ # Make sure goal has been set
+ if self.goal_torque is None:
+ self.set_goal(np.zeros(self.control_dim))
+
+ # Update state
+ self.update()
+
+ # Only linear interpolator is currently supported
+ if self.interpolator is not None:
+ # Linear case
+ if self.interpolator.order == 1:
+ self.current_torque = self.interpolator.get_interpolated_goal()
+ else:
+ # Nonlinear case not currently supported
+ pass
+ else:
+ self.current_torque = np.array(self.goal_torque)
+
+ # Add gravity compensation
+ self.torques = self.current_torque + self.torque_compensation
+
+ # Always run superclass call for any cleanups at the end
+ super().run_controller()
+
+ # Return final torques
+ return self.torques
+
+ def reset_goal(self):
+ """
+ Resets joint torque goal to be all zeros (pre-compensation)
+ """
+ self.goal_torque = np.zeros(self.control_dim)
+
+ # Reset interpolator if required
+ if self.interpolator is not None:
+ self.interpolator.set_goal(self.goal_torque)
+
+ @property
+ def name(self):
+ return "JOINT_TORQUE"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_vel.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_vel.py
new file mode 100644
index 0000000000000000000000000000000000000000..20ae9946b290182e99cfbd56e55efdf20ada6357
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/joint_vel.py
@@ -0,0 +1,211 @@
+import numpy as np
+
+from robosuite.controllers.base_controller import Controller
+from robosuite.utils.buffers import RingBuffer
+
+
+class JointVelocityController(Controller):
+ """
+    Controller for controlling the robot arm's joint velocities. Desired torques (pre gravity compensation) are
+    computed from the velocity error of the robot joints via a simple PID law (see run_controller).
+
+ NOTE: Control input actions assumed to be taken as absolute joint velocities. A given action to this
+ controller is assumed to be of the form: (vel_j0, vel_j1, ... , vel_jn-1) for an n-joint robot
+
+ Args:
+ sim (MjSim): Simulator instance this controller will pull robot state updates from
+
+ eef_name (str): Name of controlled robot arm's end effector (from robot XML)
+
+ joint_indexes (dict): Each key contains sim reference indexes to relevant robot joint information, namely:
+
+ :`'joints'`: list of indexes to relevant robot joints
+ :`'qpos'`: list of indexes to relevant robot joint positions
+ :`'qvel'`: list of indexes to relevant robot joint velocities
+
+ actuator_range (2-tuple of array of float): 2-Tuple (low, high) representing the robot joint actuator range
+
+        input_max (float or list of float): Maximum above which an inputted action will be clipped. Can either be
+            a scalar (same value for all action dimensions), or a list (specific values for each dimension). If the
+            latter, dimension should be the same as the control dimension for this controller
+
+        input_min (float or list of float): Minimum below which an inputted action will be clipped. Can either be
+            a scalar (same value for all action dimensions), or a list (specific values for each dimension). If the
+            latter, dimension should be the same as the control dimension for this controller
+
+        output_max (float or list of float): Maximum which defines upper end of scaling range when scaling an input
+            action. Can either be a scalar (same value for all action dimensions), or a list (specific values for
+            each dimension). If the latter, dimension should be the same as the control dimension for this controller
+
+        output_min (float or list of float): Minimum which defines lower end of scaling range when scaling an input
+            action. Can either be a scalar (same value for all action dimensions), or a list (specific values for
+            each dimension). If the latter, dimension should be the same as the control dimension for this controller
+
+        kp (float or list of float): velocity gain for determining desired torques based upon the joint vel errors.
+            Can either be a scalar (same value for all action dims), or a list (specific values for each dim)
+
+        policy_freq (int): Frequency at which actions from the robot policy are fed into this controller
+
+        velocity_limits (2-list of float or 2-list of list of floats): Limits (m/s) below and above which the magnitude
+            of a calculated goal joint velocity will be clipped. Can either be a 2-list (same min/max value for all
+            joint dims), or a 2-list of list (specific min/max values for each dim)
+
+ interpolator (Interpolator): Interpolator object to be used for interpolating from the current joint velocities
+ to the goal joint velocities during each timestep between inputted actions
+
+ **kwargs: Does nothing; placeholder to "sink" any additional arguments so that instantiating this controller
+ via an argument dict that has additional extraneous arguments won't raise an error
+ """
+
+ def __init__(
+ self,
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ input_max=1,
+ input_min=-1,
+ output_max=1,
+ output_min=-1,
+ kp=0.25,
+ policy_freq=20,
+ velocity_limits=None,
+ interpolator=None,
+ **kwargs, # does nothing; used so no error raised when dict is passed with extra terms used previously
+ ):
+
+ super().__init__(
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ )
+ # Control dimension
+ self.control_dim = len(joint_indexes["joints"])
+
+ # input and output max and min (allow for either explicit lists or single numbers)
+ self.input_max = self.nums2array(input_max, self.joint_dim)
+ self.input_min = self.nums2array(input_min, self.joint_dim)
+ self.output_max = self.nums2array(output_max, self.joint_dim)
+ self.output_min = self.nums2array(output_min, self.joint_dim)
+
+        # gains and corresponding vars
+ self.kp = self.nums2array(kp, self.joint_dim)
+        # if kp is a single scalar, derive per-joint gains from it below
+
+        if type(kp) is float or type(kp) is int:
+            # Scale kp according to how wide the actuator range is for this robot
+ low, high = self.actuator_limits
+ self.kp = kp * (high - low)
+ self.ki = self.kp * 0.005
+ self.kd = self.kp * 0.001
+ self.last_err = np.zeros(self.joint_dim)
+ self.derr_buf = RingBuffer(dim=self.joint_dim, length=5)
+ self.summed_err = np.zeros(self.joint_dim)
+ self.saturated = False
+ self.last_joint_vel = np.zeros(self.joint_dim)
+
+ # limits
+ self.velocity_limits = np.array(velocity_limits) if velocity_limits is not None else None
+
+ # control frequency
+ self.control_freq = policy_freq
+
+ # interpolator
+ self.interpolator = interpolator
+
+ # initialize torques and goal velocity
+ self.goal_vel = None # Goal velocity desired, pre-compensation
+ self.current_vel = np.zeros(self.joint_dim) # Current velocity setpoint, pre-compensation
+ self.torques = None # Torques returned every time run_controller is called
+
+ def set_goal(self, velocities):
+ """
+ Sets goal based on input @velocities.
+
+ Args:
+ velocities (Iterable): Desired joint velocities
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ # Update state
+ self.update()
+
+        # Check to make sure velocities is size self.joint_dim
+ assert (
+ len(velocities) == self.joint_dim
+ ), "Goal action must be equal to the robot's joint dimension space! Expected {}, got {}".format(
+ self.joint_dim, len(velocities)
+ )
+
+ self.goal_vel = self.scale_action(velocities)
+ if self.velocity_limits is not None:
+ self.goal_vel = np.clip(self.goal_vel, self.velocity_limits[0], self.velocity_limits[1])
+
+ if self.interpolator is not None:
+ self.interpolator.set_goal(self.goal_vel)
+
+ def run_controller(self):
+ """
+ Calculates the torques required to reach the desired setpoint
+
+ Returns:
+ np.array: Command torques
+ """
+ # Make sure goal has been set
+ if self.goal_vel is None:
+ self.set_goal(np.zeros(self.joint_dim))
+
+ # Update state
+ self.update()
+
+ # Only linear interpolator is currently supported
+ if self.interpolator is not None:
+ if self.interpolator.order == 1:
+ # Linear case
+ self.current_vel = self.interpolator.get_interpolated_goal()
+ else:
+ # Nonlinear case not currently supported
+ pass
+ else:
+ self.current_vel = np.array(self.goal_vel)
+
+ # Compute necessary error terms for PID velocity controller
+ err = self.current_vel - self.joint_vel
+ derr = err - self.last_err
+ self.last_err = err
+ self.derr_buf.push(derr)
+
+ # Only add to I component if we're not saturated (anti-windup)
+ if not self.saturated:
+ self.summed_err += err
+
+ # Compute command torques via PID velocity controller plus gravity compensation torques
+ torques = self.kp * err + self.ki * self.summed_err + self.kd * self.derr_buf.average + self.torque_compensation
+
+ # Clip torques
+ self.torques = self.clip_torques(torques)
+
+ # Check if we're saturated
+ self.saturated = False if np.sum(np.abs(self.torques - torques)) == 0 else True
+
+ # Always run superclass call for any cleanups at the end
+ super().run_controller()
+
+ # Return final torques
+ return self.torques
+
+ def reset_goal(self):
+ """
+ Resets joint velocity goal to be all zeros
+ """
+ self.goal_vel = np.zeros(self.joint_dim)
+
+ # Reset interpolator if required
+ if self.interpolator is not None:
+ self.interpolator.set_goal(self.goal_vel)
+
+ @property
+ def name(self):
+ return "JOINT_VELOCITY"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/controllers/osc.py b/phantom/submodules/phantom-robosuite/robosuite/controllers/osc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a45843d087ca2329456a53f5a86e87ded6e9c44a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/controllers/osc.py
@@ -0,0 +1,413 @@
+import math
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.controllers.base_controller import Controller
+from robosuite.utils.control_utils import *
+
+# Supported impedance modes
+IMPEDANCE_MODES = {"fixed", "variable", "variable_kp"}
+
+# TODO: Maybe better naming scheme to differentiate between input / output min / max and pos/ori limits, etc.
+
+
+class OperationalSpaceController(Controller):
+ """
+ Controller for controlling robot arm via operational space control. Allows position and / or orientation control
+ of the robot's end effector. For detailed information as to the mathematical foundation for this controller, please
+ reference http://khatib.stanford.edu/publications/pdfs/Khatib_1987_RA.pdf
+
+ NOTE: Control input actions can either be taken to be relative to the current position / orientation of the
+ end effector or absolute values. In either case, a given action to this controller is assumed to be of the form:
+ (x, y, z, ax, ay, az) if controlling pos and ori or simply (x, y, z) if only controlling pos
+
+ Args:
+ sim (MjSim): Simulator instance this controller will pull robot state updates from
+
+ eef_name (str): Name of controlled robot arm's end effector (from robot XML)
+
+ joint_indexes (dict): Each key contains sim reference indexes to relevant robot joint information, namely:
+
+ :`'joints'`: list of indexes to relevant robot joints
+ :`'qpos'`: list of indexes to relevant robot joint positions
+ :`'qvel'`: list of indexes to relevant robot joint velocities
+
+ actuator_range (2-tuple of array of float): 2-Tuple (low, high) representing the robot joint actuator range
+
+ input_max (float or Iterable of float): Maximum above which an inputted action will be clipped. Can either be
+ a scalar (same value for all action dimensions), or a list (specific values for each dimension). If the
+ latter, dimension should be the same as the control dimension for this controller
+
+ input_min (float or Iterable of float): Minimum below which an inputted action will be clipped. Can either be
+ a scalar (same value for all action dimensions), or a list (specific values for each dimension). If the
+ latter, dimension should be the same as the control dimension for this controller
+
+ output_max (float or Iterable of float): Maximum which defines upper end of scaling range when scaling an input
+ action. Can either be a scalar (same value for all action dimensions), or a list (specific values for
+ each dimension). If the latter, dimension should be the same as the control dimension for this controller
+
+ output_min (float or Iterable of float): Minimum which defines lower end of scaling range when scaling an input
+ action. Can either be a scalar (same value for all action dimensions), or a list (specific values for
+ each dimension). If the latter, dimension should be the same as the control dimension for this controller
+
+ kp (float or Iterable of float): positional gain for determining desired torques based upon the pos / ori error.
+ Can either be a scalar (same value for all action dims), or a list (specific values for each dim)
+
+ damping_ratio (float or Iterable of float): used in conjunction with kp to determine the velocity gain for
+ determining desired torques based upon the joint pos errors. Can either be a scalar (same value for all
+ action dims), or a list (specific values for each dim)
+
+ impedance_mode (str): Impedance mode with which to run this controller. Options are {"fixed", "variable",
+ "variable_kp"}. If "fixed", the controller will have fixed kp and damping_ratio values as specified by the
+ @kp and @damping_ratio arguments. If "variable", both kp and damping_ratio will now be part of the
+ controller action space, resulting in a total action space of (6 or 3) + 6 * 2. If "variable_kp", only kp
+ will become variable, with damping_ratio fixed at 1 (critically damped). The resulting action space will
+ then be (6 or 3) + 6.
+
+ kp_limits (2-list of float or 2-list of Iterable of floats): Only applicable if @impedance_mode is set to either
+ "variable" or "variable_kp". This sets the corresponding min / max ranges of the controller action space
+ for the varying kp values. Can either be a 2-list (same min / max for all kp action dims), or a 2-list
+ of list (specific min / max for each kp dim)
+
+ damping_ratio_limits (2-list of float or 2-list of Iterable of floats): Only applicable if @impedance_mode is
+ set to "variable". This sets the corresponding min / max ranges of the controller action space for the
+ varying damping_ratio values. Can either be a 2-list (same min / max for all damping_ratio action dims),
+ or a 2-list of list (specific min / max for each damping_ratio dim)
+
+ policy_freq (int): Frequency at which actions from the robot policy are fed into this controller
+
+ position_limits (2-list of float or 2-list of Iterable of floats): Limits (m) below and above which the
+ magnitude of a calculated goal eef position will be clipped. Can either be a 2-list (same min/max value
+ for all cartesian dims), or a 2-list of list (specific min/max values for each dim)
+
+ orientation_limits (2-list of float or 2-list of Iterable of floats): Limits (rad) below and above which the
+ magnitude of a calculated goal eef orientation will be clipped. Can either be a 2-list
+ (same min/max value for all orientation dims), or a 2-list of list (specific min/max values for each dim)
+
+ interpolator_pos (Interpolator): Interpolator object to be used for interpolating from the current position to
+ the goal position during each timestep between inputted actions
+
+ interpolator_ori (Interpolator): Interpolator object to be used for interpolating from the current orientation
+ to the goal orientation during each timestep between inputted actions
+
+ control_ori (bool): Whether inputted actions will control both pos and ori or exclusively pos
+
+ control_delta (bool): Whether to control the robot using delta or absolute commands (where absolute commands
+ are taken in the world coordinate frame)
+
+ uncouple_pos_ori (bool): Whether to decouple torques meant to control pos and torques meant to control ori
+
+ **kwargs: Does nothing; placeholder to "sink" any additional arguments so that instantiating this controller
+ via an argument dict that has additional extraneous arguments won't raise an error
+
+ Raises:
+ AssertionError: [Invalid impedance mode]
+ """
+
+ def __init__(
+ self,
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ input_max=1,
+ input_min=-1,
+ output_max=(0.05, 0.05, 0.05, 0.5, 0.5, 0.5),
+ output_min=(-0.05, -0.05, -0.05, -0.5, -0.5, -0.5),
+ kp=150,
+ damping_ratio=1,
+ impedance_mode="fixed",
+ kp_limits=(0, 300),
+ damping_ratio_limits=(0, 100),
+ policy_freq=20,
+ position_limits=None,
+ orientation_limits=None,
+ interpolator_pos=None,
+ interpolator_ori=None,
+ control_ori=True,
+ control_delta=True,
+ uncouple_pos_ori=True,
+ **kwargs, # does nothing; used so no error raised when dict is passed with extra terms used previously
+ ):
+
+ super().__init__(
+ sim,
+ eef_name,
+ joint_indexes,
+ actuator_range,
+ )
+ # Determine whether this is pos ori or just pos
+ self.use_ori = control_ori
+
+ # Determine whether we want to use delta or absolute values as inputs
+ self.use_delta = control_delta
+
+ # Control dimension
+ self.control_dim = 6 if self.use_ori else 3
+ self.name_suffix = "POSE" if self.use_ori else "POSITION"
+
+ # input and output max and min (allow for either explicit lists or single numbers)
+ self.input_max = self.nums2array(input_max, self.control_dim)
+ self.input_min = self.nums2array(input_min, self.control_dim)
+ self.output_max = self.nums2array(output_max, self.control_dim)
+ self.output_min = self.nums2array(output_min, self.control_dim)
+
+ # kp kd
+ self.kp = self.nums2array(kp, 6)
+ self.kd = 2 * np.sqrt(self.kp) * damping_ratio
+
+ # kp and kd limits
+ self.kp_min = self.nums2array(kp_limits[0], 6)
+ self.kp_max = self.nums2array(kp_limits[1], 6)
+ self.damping_ratio_min = self.nums2array(damping_ratio_limits[0], 6)
+ self.damping_ratio_max = self.nums2array(damping_ratio_limits[1], 6)
+
+ # Verify the proposed impedance mode is supported
+ assert impedance_mode in IMPEDANCE_MODES, (
+ "Error: Tried to instantiate OSC controller for unsupported "
+ "impedance mode! Inputted impedance mode: {}, Supported modes: {}".format(impedance_mode, IMPEDANCE_MODES)
+ )
+
+ # Impedance mode
+ self.impedance_mode = impedance_mode
+
+ # Add to control dim based on impedance_mode
+ if self.impedance_mode == "variable":
+ self.control_dim += 12
+ elif self.impedance_mode == "variable_kp":
+ self.control_dim += 6
+
+ # limits
+ self.position_limits = np.array(position_limits) if position_limits is not None else position_limits
+ self.orientation_limits = np.array(orientation_limits) if orientation_limits is not None else orientation_limits
+
+ # control frequency
+ self.control_freq = policy_freq
+
+ # interpolator
+ self.interpolator_pos = interpolator_pos
+ self.interpolator_ori = interpolator_ori
+
+ # whether or not pos and ori want to be uncoupled
+ self.uncoupling = uncouple_pos_ori
+
+ # initialize goals based on initial pos / ori
+ self.goal_ori = np.array(self.initial_ee_ori_mat)
+ self.goal_pos = np.array(self.initial_ee_pos)
+
+ self.relative_ori = np.zeros(3)
+ self.ori_ref = None
+
+ def set_goal(self, action, set_pos=None, set_ori=None):
+ """
+ Sets goal based on input @action. If self.impedance_mode is not "fixed", then the input will be parsed into the
+ delta values to update the goal position / pose and the kp and/or damping_ratio values to be immediately updated
+ internally before executing the subsequent control loop.
+
+ Note that @action is expected to be in the following format, based on the impedance mode!
+
+ :Mode `'fixed'`: [pose command]
+ :Mode `'variable'`: [damping_ratio values, kp values, pose command]
+ :Mode `'variable_kp'`: [kp values, pose command]
+
+ Args:
+ action (Iterable): Desired relative eef pose goal state
+ set_pos (Iterable): If set, overrides @action and sets the desired absolute eef position goal state
+ set_ori (Iterable): If set, overrides @action and sets the desired absolute eef orientation goal state
+ """
+ # Update state
+ self.update()
+
+ # Parse action based on the impedance mode, and update kp / kd as necessary
+ if self.impedance_mode == "variable":
+ damping_ratio, kp, delta = action[:6], action[6:12], action[12:]
+ self.kp = np.clip(kp, self.kp_min, self.kp_max)
+ self.kd = 2 * np.sqrt(self.kp) * np.clip(damping_ratio, self.damping_ratio_min, self.damping_ratio_max)
+ elif self.impedance_mode == "variable_kp":
+ kp, delta = action[:6], action[6:]
+ self.kp = np.clip(kp, self.kp_min, self.kp_max)
+ self.kd = 2 * np.sqrt(self.kp) # critically damped
+ else: # This is case "fixed"
+ delta = action
+
+ # If we're using deltas, interpret actions as such
+ if self.use_delta:
+ if delta is not None:
+ scaled_delta = self.scale_action(delta)
+ if not self.use_ori and set_ori is None:
+ # Set default control for ori since user isn't actively controlling ori
+ set_ori = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, -1.0]])
+ else:
+ scaled_delta = []
+ # Else, interpret actions as absolute values
+ else:
+ if set_pos is None:
+ set_pos = delta[:3]
+ # Set default control for ori if we're only using position control
+ if set_ori is None:
+ set_ori = (
+ T.quat2mat(T.axisangle2quat(delta[3:6]))
+ if self.use_ori
+ else np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, -1.0]])
+ )
+ # No scaling of values since these are absolute values
+ scaled_delta = delta
+
+ # We only want to update goal orientation if there is a valid delta ori value OR if we're using absolute ori
+ # use math.isclose instead of numpy because numpy is slow
+ bools = [0.0 if math.isclose(elem, 0.0) else 1.0 for elem in scaled_delta[3:]]
+ if sum(bools) > 0.0 or set_ori is not None:
+ self.goal_ori = set_goal_orientation(
+ scaled_delta[3:], self.ee_ori_mat, orientation_limit=self.orientation_limits, set_ori=set_ori
+ )
+ self.goal_pos = set_goal_position(
+ scaled_delta[:3], self.ee_pos, position_limit=self.position_limits, set_pos=set_pos
+ )
+
+ if self.interpolator_pos is not None:
+ self.interpolator_pos.set_goal(self.goal_pos)
+
+ if self.interpolator_ori is not None:
+ self.ori_ref = np.array(self.ee_ori_mat) # reference is the current orientation at start
+ self.interpolator_ori.set_goal(
+ orientation_error(self.goal_ori, self.ori_ref)
+ ) # goal is the total orientation error
+ self.relative_ori = np.zeros(3) # relative orientation always starts at 0
+
+ def run_controller(self):
+ """
+ Calculates the torques required to reach the desired setpoint.
+
+ Executes Operational Space Control (OSC) -- either position only or position and orientation.
+
+ A detailed overview of derivation of OSC equations can be seen at:
+ http://khatib.stanford.edu/publications/pdfs/Khatib_1987_RA.pdf
+
+ Returns:
+ np.array: Command torques
+ """
+ # Update state
+ self.update()
+
+ desired_pos = None
+ # Only linear interpolator is currently supported
+ if self.interpolator_pos is not None:
+ # Linear case
+ if self.interpolator_pos.order == 1:
+ desired_pos = self.interpolator_pos.get_interpolated_goal()
+ else:
+ # Nonlinear case not currently supported
+ pass
+ else:
+ desired_pos = np.array(self.goal_pos)
+
+ if self.interpolator_ori is not None:
+ # relative orientation based on difference between current ori and ref
+ self.relative_ori = orientation_error(self.ee_ori_mat, self.ori_ref)
+
+ ori_error = self.interpolator_ori.get_interpolated_goal()
+ else:
+ desired_ori = np.array(self.goal_ori)
+ ori_error = orientation_error(desired_ori, self.ee_ori_mat)
+
+ # Compute desired force and torque based on errors
+ position_error = desired_pos - self.ee_pos
+ vel_pos_error = -self.ee_pos_vel
+
+ # F_r = kp * pos_err + kd * vel_err
+ desired_force = np.multiply(np.array(position_error), np.array(self.kp[0:3])) + np.multiply(
+ vel_pos_error, self.kd[0:3]
+ )
+
+ vel_ori_error = -self.ee_ori_vel
+
+ # Tau_r = kp * ori_err + kd * vel_err
+ desired_torque = np.multiply(np.array(ori_error), np.array(self.kp[3:6])) + np.multiply(
+ vel_ori_error, self.kd[3:6]
+ )
+
+ # Compute nullspace matrix (I - Jbar * J) and lambda matrices ((J * M^-1 * J^T)^-1)
+ lambda_full, lambda_pos, lambda_ori, nullspace_matrix = opspace_matrices(
+ self.mass_matrix, self.J_full, self.J_pos, self.J_ori
+ )
+
+ # Decouples desired positional control from orientation control
+ if self.uncoupling:
+ decoupled_force = np.dot(lambda_pos, desired_force)
+ decoupled_torque = np.dot(lambda_ori, desired_torque)
+ decoupled_wrench = np.concatenate([decoupled_force, decoupled_torque])
+ else:
+ desired_wrench = np.concatenate([desired_force, desired_torque])
+ decoupled_wrench = np.dot(lambda_full, desired_wrench)
+
+ # Gamma (without null torques) = J^T * F + gravity compensations
+ self.torques = np.dot(self.J_full.T, decoupled_wrench) + self.torque_compensation
+
+ # Calculate and add nullspace torques (nullspace_matrix^T * Gamma_null) to final torques
+ # Note: Gamma_null = desired nullspace pose torques, assumed to be positional joint control relative
+ # to the initial joint positions
+ self.torques += nullspace_torques(
+ self.mass_matrix, nullspace_matrix, self.initial_joint, self.joint_pos, self.joint_vel
+ )
+
+ # Always run superclass call for any cleanups at the end
+ super().run_controller()
+
+ return self.torques
+
+ def update_initial_joints(self, initial_joints):
+ # First, update from the superclass method
+ super().update_initial_joints(initial_joints)
+
+ # We also need to reset the goal in case the old goals were set to the initial configuration
+ self.reset_goal()
+
+ def reset_goal(self):
+ """
+ Resets the goal to the current state of the robot
+ """
+ self.goal_ori = np.array(self.ee_ori_mat)
+ self.goal_pos = np.array(self.ee_pos)
+
+ # Also reset interpolators if required
+
+ if self.interpolator_pos is not None:
+ self.interpolator_pos.set_goal(self.goal_pos)
+
+ if self.interpolator_ori is not None:
+ self.ori_ref = np.array(self.ee_ori_mat) # reference is the current orientation at start
+ self.interpolator_ori.set_goal(
+ orientation_error(self.goal_ori, self.ori_ref)
+ ) # goal is the total orientation error
+ self.relative_ori = np.zeros(3) # relative orientation always starts at 0
+
+ @property
+ def control_limits(self):
+ """
+ Returns the limits over this controller's action space, overrides the superclass property
+ Returns the following (generalized for both high and low limits), based on the impedance mode:
+
+ :Mode `'fixed'`: [pose command]
+ :Mode `'variable'`: [damping_ratio values, kp values, pose command]
+ :Mode `'variable_kp'`: [kp values, pose command]
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum action values
+ - (np.array) maximum action values
+ """
+ if self.impedance_mode == "variable":
+ low = np.concatenate([self.damping_ratio_min, self.kp_min, self.input_min])
+ high = np.concatenate([self.damping_ratio_max, self.kp_max, self.input_max])
+ elif self.impedance_mode == "variable_kp":
+ low = np.concatenate([self.kp_min, self.input_min])
+ high = np.concatenate([self.kp_max, self.input_max])
+ else: # This is case "fixed"
+ low, high = self.input_min, self.input_max
+ return low, high
+
+ @property
+ def name(self):
+ return "OSC_" + self.name_suffix
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_collect_and_playback_data.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_collect_and_playback_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bb71c316afe5f0b68e80e52082f83360d3ecec8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_collect_and_playback_data.py
@@ -0,0 +1,107 @@
+"""
+Record trajectory data with the DataCollectionWrapper wrapper and play them back.
+
+Example:
+ $ python demo_collect_and_playback_data.py --environment Lift
+"""
+
+import argparse
+import os
+from glob import glob
+
+import numpy as np
+
+import robosuite as suite
+from robosuite.wrappers import DataCollectionWrapper
+
+
+def collect_random_trajectory(env, timesteps=1000):
+ """Run a random policy to collect trajectories.
+
+ The rollout trajectory is saved to files in npz format.
+ Modify the DataCollectionWrapper wrapper to add new fields or change data formats.
+
+ Args:
+ env (MujocoEnv): environment instance to collect trajectories from
+ timesteps (int): how many environment timesteps to run for a given trajectory
+ """
+
+ env.reset()
+ dof = env.action_dim
+
+ for t in range(timesteps):
+ action = np.random.randn(dof)
+ env.step(action)
+ env.render()
+ if t % 100 == 0:
+ print(t)
+
+
+def playback_trajectory(env, ep_dir):
+ """Playback data from an episode.
+
+ Args:
+ env (MujocoEnv): environment instance to playback trajectory in
+ ep_dir (str): The path to the directory containing data for an episode.
+ """
+
+ # first reload the model from the xml
+ xml_path = os.path.join(ep_dir, "model.xml")
+ with open(xml_path, "r") as f:
+ env.reset_from_xml_string(f.read())
+
+ state_paths = os.path.join(ep_dir, "state_*.npz")
+
+ # read states back, load them one by one, and render
+ t = 0
+ for state_file in sorted(glob(state_paths)):
+ print(state_file)
+ dic = np.load(state_file)
+ states = dic["states"]
+ for state in states:
+ env.sim.set_state_from_flattened(state)
+ env.sim.forward()
+ env.render()
+ t += 1
+ if t % 100 == 0:
+ print(t)
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--environment", type=str, default="Door")
+ parser.add_argument("--robots", nargs="+", type=str, default="Panda", help="Which robot(s) to use in the env")
+ parser.add_argument("--directory", type=str, default="/tmp/")
+ parser.add_argument("--timesteps", type=int, default=1000)
+ args = parser.parse_args()
+
+ # create original environment
+ env = suite.make(
+ args.environment,
+ robots=args.robots,
+ ignore_done=True,
+ use_camera_obs=False,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ control_freq=20,
+ )
+ data_directory = args.directory
+
+ # wrap the environment with data collection wrapper
+ env = DataCollectionWrapper(env, data_directory)
+
+ # testing to make sure multiple env.reset calls don't create multiple directories
+ env.reset()
+ env.reset()
+ env.reset()
+
+ # collect some data
+ print("Collecting some random data...")
+ collect_random_trajectory(env, timesteps=args.timesteps)
+
+ # playback some data
+ _ = input("Press any key to begin the playback...")
+ print("Playing back the data...")
+ data_directory = env.ep_directory
+ playback_trajectory(env, data_directory)
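+
+    # Editor's note (illustrative): each episode directory written by DataCollectionWrapper
+    # contains a model.xml plus state_*.npz files, and the "states" array inside each npz is
+    # what playback_trajectory() restores. To inspect one file from a Python shell
+    # (the path below is hypothetical):
+    #
+    #   import numpy as np
+    #   dic = np.load("/tmp/ep_.../state_500.npz")
+    #   print(dic["states"].shape)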
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_control.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_control.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca78344fc93d7bacce611b9a3e77f6bbfde6f155
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_control.py
@@ -0,0 +1,161 @@
+"""
+This demo script demonstrates the various functionalities of each controller available within robosuite.
+
+For a given controller, runs through each dimension and executes a perturbation "test_value" from its
+neutral (stationary) value for a certain amount of time "steps_per_action", and then returns to all neutral values
+for time "steps_per_rest" before proceeding with the next action dim.
+
+ E.g.: Given that the expected action space of the Pos / Ori (OSC_POSE) controller (without a gripper) is
+ (dx, dy, dz, droll, dpitch, dyaw), the testing sequence of actions over time will be:
+
+ ***START OF DEMO***
+ ( dx, 0, 0, 0, 0, 0, grip) <-- Translation in x-direction for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, dy, 0, 0, 0, 0, grip) <-- Translation in y-direction for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, 0, dz, 0, 0, 0, grip) <-- Translation in z-direction for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, 0, 0, dr, 0, 0, grip) <-- Rotation in roll (x) axis for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, 0, 0, 0, dp, 0, grip) <-- Rotation in pitch (y) axis for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, 0, 0, 0, 0, dy, grip) <-- Rotation in yaw (z) axis for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ***END OF DEMO***
+
+ Thus the OSC_POSE controller should be expected to sequentially move linearly in the x direction first,
+ then the y direction, then the z direction, and then begin sequentially rotating about its x-axis,
+ then y-axis, then z-axis.
+
+Please reference the documentation of Controllers in the Modules section for an overview of each controller.
+Controllers are expected to behave in a generally controlled manner, according to their control space. The expected
+sequential qualitative behavior during the test is described below for each controller:
+
+* OSC_POSE: Gripper moves sequentially and linearly in x, y, z direction, then sequentially rotates in x-axis, y-axis,
+ z-axis, relative to the global coordinate frame
+* OSC_POSITION: Gripper moves sequentially and linearly in x, y, z direction, relative to the global coordinate frame
+* IK_POSE: Gripper moves sequentially and linearly in x, y, z direction, then sequentially rotates in x-axis, y-axis,
+ z-axis, relative to the local robot end effector frame
+* JOINT_POSITION: Robot Joints move sequentially in a controlled fashion
+* JOINT_VELOCITY: Robot Joints move sequentially in a controlled fashion
+* JOINT_TORQUE: Unlike other controllers, joint torque controller is expected to act rather lethargic, as the
+ "controller" is really just a wrapper for direct torque control of the mujoco actuators. Therefore, a
+ "neutral" value of 0 torque will not guarantee a stable robot when it has non-zero velocity!
+
+"""
+
+import robosuite as suite
+from robosuite.controllers import load_controller_config
+from robosuite.robots import Bimanual
+from robosuite.utils.input_utils import *
+
+if __name__ == "__main__":
+
+ # Create dict to hold options that will be passed to env creation call
+ options = {}
+
+ # print welcome info
+ print("Welcome to robosuite v{}!".format(suite.__version__))
+ print(suite.__logo__)
+
+ # Choose environment and add it to options
+ options["env_name"] = choose_environment()
+
+ # If a multi-arm environment has been chosen, choose configuration and appropriate robot(s)
+ if "TwoArm" in options["env_name"]:
+ # Choose env config and add it to options
+ options["env_configuration"] = choose_multi_arm_config()
+
+ # If chosen configuration was bimanual, the corresponding robot must be Baxter. Else, have user choose robots
+ if options["env_configuration"] == "bimanual":
+ options["robots"] = "Baxter"
+ else:
+ options["robots"] = []
+
+ # Have user choose two robots
+ print("A multiple single-arm configuration was chosen.\n")
+
+ for i in range(2):
+ print("Please choose Robot {}...\n".format(i))
+ options["robots"].append(choose_robots(exclude_bimanual=True))
+
+ # Else, we simply choose a single (single-armed) robot to instantiate in the environment
+ else:
+ options["robots"] = choose_robots(exclude_bimanual=True)
+
+ # Hacky way to grab joint dimension for now
+ joint_dim = 6 if options["robots"] == "UR5e" else 7
+
+ # Choose controller
+ controller_name = choose_controller()
+
+ # Load the desired controller
+ options["controller_configs"] = suite.load_controller_config(default_controller=controller_name)
+
+ # Define the pre-defined controller actions to use (action_dim, num_test_steps, test_value)
+ controller_settings = {
+ "OSC_POSE": [6, 6, 0.1],
+ "OSC_POSITION": [3, 3, 0.1],
+ "IK_POSE": [6, 6, 0.01],
+ "JOINT_POSITION": [joint_dim, joint_dim, 0.2],
+ "JOINT_VELOCITY": [joint_dim, joint_dim, -0.1],
+ "JOINT_TORQUE": [joint_dim, joint_dim, 0.25],
+ }
+
+ # Define variables for each controller test
+ action_dim = controller_settings[controller_name][0]
+ num_test_steps = controller_settings[controller_name][1]
+ test_value = controller_settings[controller_name][2]
+
+ # Define the number of timesteps to use per controller action as well as timesteps in between actions
+ steps_per_action = 75
+ steps_per_rest = 75
+
+ # initialize the task
+ env = suite.make(
+ **options,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ ignore_done=True,
+ use_camera_obs=False,
+ horizon=(steps_per_action + steps_per_rest) * num_test_steps,
+ control_freq=20,
+ )
+ env.reset()
+ env.viewer.set_camera(camera_id=0)
+
+ # To accommodate multi-arm settings (e.g., Baxter), we need to make sure to fill any extra action space
+ # Get total number of arms being controlled
+ n = 0
+ gripper_dim = 0
+ for robot in env.robots:
+ gripper_dim = robot.gripper["right"].dof if isinstance(robot, Bimanual) else robot.gripper.dof
+ n += int(robot.action_dim / (action_dim + gripper_dim))
+
+ # Define neutral value
+ neutral = np.zeros(action_dim + gripper_dim)
+
+ # Keep track of the test-step count to know when to break the loop
+ count = 0
+ # Loop through controller space
+ while count < num_test_steps:
+ action = neutral.copy()
+ for i in range(steps_per_action):
+ if controller_name in {"IK_POSE", "OSC_POSE"} and count > 2:
+ # Set this value to be the scaled axis angle vector
+ vec = np.zeros(3)
+ vec[count - 3] = test_value
+ action[3:6] = vec
+ else:
+ action[count] = test_value
+ total_action = np.tile(action, n)
+ env.step(total_action)
+ env.render()
+ for i in range(steps_per_rest):
+ total_action = np.tile(neutral, n)
+ env.step(total_action)
+ env.render()
+ count += 1
+
+ # Shut down this env before starting the next test
+ env.close()
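+
+    # Editor's note (illustrative): to run this test non-interactively, replace the prompts
+    # above by filling the options directly, e.g. options = {"env_name": "Lift", "robots": "Panda"}
+    # and controller_name = "OSC_POSE"; the environment and robot names are examples only.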
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_device_control.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_device_control.py
new file mode 100644
index 0000000000000000000000000000000000000000..57c8cb1c819e86ea50448c9fb4a43fcb49cd3815
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_device_control.py
@@ -0,0 +1,241 @@
+"""Teleoperate robot with keyboard or SpaceMouse.
+
+***Choose user input option with the --device argument***
+
+Keyboard:
+ We use the keyboard to control the end-effector of the robot.
+ The keyboard provides 6-DoF control commands through various keys.
+ The commands are mapped to joint velocities through an inverse kinematics
+ solver from Bullet physics.
+
+ Note:
+ To run this script with macOS, you must run it with root access.
+
+SpaceMouse:
+
+ We use the SpaceMouse 3D mouse to control the end-effector of the robot.
+ The mouse provides 6-DoF control commands. The commands are mapped to joint
+ velocities through an inverse kinematics solver from Bullet physics.
+
+ The two side buttons of SpaceMouse are used for controlling the grippers.
+
+ SpaceMouse Wireless from 3Dconnexion: https://www.3dconnexion.com/spacemouse_wireless/en/
+ We used the SpaceMouse Wireless in our experiments. The paper below used the same device
+ to collect human demonstrations for imitation learning.
+
+ Reinforcement and Imitation Learning for Diverse Visuomotor Skills
+ Yuke Zhu, Ziyu Wang, Josh Merel, Andrei Rusu, Tom Erez, Serkan Cabi, Saran Tunyasuvunakool,
+ János Kramár, Raia Hadsell, Nando de Freitas, Nicolas Heess
+ RSS 2018
+
+ Note:
+ This current implementation only supports macOS (Linux support can be added).
+ Download and install the driver before running the script:
+ https://www.3dconnexion.com/service/drivers.html
+
+Additionally, --pos_sensitivity and --rot_sensitivity provide relative gains for increasing / decreasing the user input
+device sensitivity
+
+
+***Choose controller with the --controller argument***
+
+Choice of using either inverse kinematics controller (ik) or operational space controller (osc):
+Main difference is that user inputs with ik's rotations are always taken relative to eef coordinate frame, whereas
+ user inputs with osc's rotations are taken relative to global frame (i.e.: static / camera frame of reference).
+
+ Notes:
+ OSC also tends to be more computationally efficient since IK relies on the backend pybullet IK solver.
+
+
+***Choose environment specifics with the following arguments***
+
+ --environment: Task to perform, e.g.: "Lift", "TwoArmPegInHole", "NutAssembly", etc.
+
+ --robots: Robot(s) with which to perform the task. Can be any in
+ {"Panda", "Sawyer", "IIWA", "Jaco", "Kinova3", "UR5e", "Baxter"}. Note that the environments include sanity
+ checks, such that a "TwoArm..." environment will only accept either a 2-tuple of robot names or a single
+ bimanual robot name, according to the specified configuration (see below), and all other environments will
+ only accept a single single-armed robot name
+
+ --config: Exclusively applicable and only should be specified for "TwoArm..." environments. Specifies the robot
+ configuration desired for the task. Options are {"bimanual", "single-arm-parallel", and "single-arm-opposed"}
+
+ -"bimanual": Sets up the environment for a single bimanual robot. Expects a single bimanual robot name to
+ be specified in the --robots argument
+
+ -"single-arm-parallel": Sets up the environment such that two single-armed robots are stationed next to
+ each other facing the same direction. Expects a 2-tuple of single-armed robot names to be specified
+ in the --robots argument.
+
+ -"single-arm-opposed": Sets up the environment such that two single-armed robots are stationed opposed from
+ each other, facing each other from opposite directions. Expects a 2-tuple of single-armed robot names
+ to be specified in the --robots argument.
+
+ --arm: Exclusively applicable and only should be specified for "TwoArm..." environments. Specifies which of the
+ multiple arm eef's to control. The other (passive) arm will remain stationary. Options are {"right", "left"}
+ (from the point of view of the robot(s) facing against the viewer direction)
+
+ --switch-on-grasp: Exclusively applicable and only should be specified for "TwoArm..." environments. If enabled,
+ will switch the current arm being controlled every time the gripper input is pressed
+
+ --toggle-camera-on-grasp: If enabled, gripper input presses will cycle through the available camera angles
+
+Examples:
+
+ For normal single-arm environment:
+ $ python demo_device_control.py --environment PickPlaceCan --robots Sawyer --controller osc
+
+ For two-arm bimanual environment:
+ $ python demo_device_control.py --environment TwoArmLift --robots Baxter --config bimanual --arm left --controller osc
+
+ For two-arm multi single-arm robot environment:
+ $ python demo_device_control.py --environment TwoArmLift --robots Sawyer Sawyer --config single-arm-parallel --controller osc
+
+
+"""
+
+import argparse
+
+import numpy as np
+
+import robosuite as suite
+from robosuite import load_controller_config
+from robosuite.utils.input_utils import input2action
+from robosuite.wrappers import VisualizationWrapper
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--environment", type=str, default="Lift")
+ parser.add_argument("--robots", nargs="+", type=str, default="Panda", help="Which robot(s) to use in the env")
+ parser.add_argument(
+ "--config", type=str, default="single-arm-opposed", help="Specified environment configuration if necessary"
+ )
+ parser.add_argument("--arm", type=str, default="right", help="Which arm to control (eg bimanual) 'right' or 'left'")
+ parser.add_argument("--switch-on-grasp", action="store_true", help="Switch gripper control on gripper action")
+ parser.add_argument("--toggle-camera-on-grasp", action="store_true", help="Switch camera angle on gripper action")
+ parser.add_argument("--controller", type=str, default="osc", help="Choice of controller. Can be 'ik' or 'osc'")
+ parser.add_argument("--device", type=str, default="keyboard")
+ parser.add_argument("--pos-sensitivity", type=float, default=1.0, help="How much to scale position user inputs")
+ parser.add_argument("--rot-sensitivity", type=float, default=1.0, help="How much to scale rotation user inputs")
+ args = parser.parse_args()
+
+ # Import controller config for EE IK or OSC (pos/ori)
+ if args.controller == "ik":
+ controller_name = "IK_POSE"
+ elif args.controller == "osc":
+ controller_name = "OSC_POSE"
+ else:
+ print("Error: Unsupported controller specified. Must be either 'ik' or 'osc'!")
+ raise ValueError
+
+ # Get controller config
+ controller_config = load_controller_config(default_controller=controller_name)
+
+ # Create argument configuration
+ config = {
+ "env_name": args.environment,
+ "robots": args.robots,
+ "controller_configs": controller_config,
+ }
+
+ # Check if we're using a multi-armed environment and use env_configuration argument if so
+ if "TwoArm" in args.environment:
+ config["env_configuration"] = args.config
+ else:
+ args.config = None
+
+ # Create environment
+ env = suite.make(
+ **config,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ render_camera="agentview",
+ ignore_done=True,
+ use_camera_obs=False,
+ reward_shaping=True,
+ control_freq=20,
+ hard_reset=False,
+ )
+
+ # Wrap this environment in a visualization wrapper
+ env = VisualizationWrapper(env, indicator_configs=None)
+
+ # Setup printing options for numbers
+ np.set_printoptions(formatter={"float": lambda x: "{0:0.3f}".format(x)})
+
+ # initialize device
+ if args.device == "keyboard":
+ from robosuite.devices import Keyboard
+
+ device = Keyboard(pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
+ env.viewer.add_keypress_callback(device.on_press)
+ elif args.device == "spacemouse":
+ from robosuite.devices import SpaceMouse
+
+ device = SpaceMouse(pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
+ else:
+ raise Exception("Invalid device choice: choose either 'keyboard' or 'spacemouse'.")
+
+ while True:
+ # Reset the environment
+ obs = env.reset()
+
+ # Setup rendering
+ cam_id = 0
+ num_cam = len(env.sim.model.camera_names)
+ env.render()
+
+ # Initialize variables that should be maintained between resets
+ last_grasp = 0
+
+ # Initialize device control
+ device.start_control()
+
+ while True:
+ # Set active robot
+ active_robot = env.robots[0] if args.config == "bimanual" else env.robots[args.arm == "left"]
+
+ # Get the newest action
+ action, grasp = input2action(
+ device=device, robot=active_robot, active_arm=args.arm, env_configuration=args.config
+ )
+
+ # If action is None, then this is a reset, so we should break
+ if action is None:
+ break
+
+ # If the current grasp is active (1) and last grasp is not (-1) (i.e.: grasping input just pressed),
+ # toggle arm control and / or camera viewing angle if requested
+ if last_grasp < 0 < grasp:
+ if args.switch_on_grasp:
+ args.arm = "left" if args.arm == "right" else "right"
+ if args.toggle_camera_on_grasp:
+ cam_id = (cam_id + 1) % num_cam
+ env.viewer.set_camera(camera_id=cam_id)
+ # Update last grasp
+ last_grasp = grasp
+
+ # Fill out the rest of the action space if necessary
+ rem_action_dim = env.action_dim - action.size
+ if rem_action_dim > 0:
+ # Initialize remaining action space
+ rem_action = np.zeros(rem_action_dim)
+ # This is a multi-arm setting, choose which arm to control and fill the rest with zeros
+ if args.arm == "right":
+ action = np.concatenate([action, rem_action])
+ elif args.arm == "left":
+ action = np.concatenate([rem_action, action])
+ else:
+ # Only right and left arms supported
+ print(
+ "Error: Unsupported arm specified -- "
+ "must be either 'right' or 'left'! Got: {}".format(args.arm)
+ )
+ elif rem_action_dim < 0:
+ # We're in an environment with no gripper action space, so trim the action space to be the action dim
+ action = action[: env.action_dim]
+
+ # Step through the simulation and render
+ obs, reward, done, info = env.step(action)
+ env.render()
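+
+# Editor's note (illustrative): for a single arm with a 1-DoF gripper, input2action() yields a
+# 7-dimensional command (6 pose deltas + grasp), so with env.action_dim == 7 no padding occurs;
+# in a two-single-arm configuration with env.action_dim == 14, rem_action_dim == 7 and the
+# padding block above fills the passive arm's half of the action with zeros.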
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_domain_randomization.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_domain_randomization.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5efb7c98dfe1c7169db5d520a104f31cb9d788d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_domain_randomization.py
@@ -0,0 +1,74 @@
+"""
+Script to showcase domain randomization functionality.
+"""
+
+import robosuite.macros as macros
+from robosuite.controllers import load_controller_config
+from robosuite.utils.input_utils import *
+from robosuite.wrappers import DomainRandomizationWrapper
+
+# We'll use instance randomization so that entire geom groups are randomized together
+macros.USING_INSTANCE_RANDOMIZATION = True
+
+if __name__ == "__main__":
+
+ # Create dict to hold options that will be passed to env creation call
+ options = {}
+
+ # print welcome info
+ print("Welcome to robosuite v{}!".format(suite.__version__))
+ print(suite.__logo__)
+
+ # Choose environment and add it to options
+ options["env_name"] = choose_environment()
+
+ # If a multi-arm environment has been chosen, choose configuration and appropriate robot(s)
+ if "TwoArm" in options["env_name"]:
+ # Choose env config and add it to options
+ options["env_configuration"] = choose_multi_arm_config()
+
+ # If chosen configuration was bimanual, the corresponding robot must be Baxter. Else, have user choose robots
+ if options["env_configuration"] == "bimanual":
+ options["robots"] = "Baxter"
+ else:
+ options["robots"] = []
+
+ # Have user choose two robots
+ print("A multiple single-arm configuration was chosen.\n")
+
+ for i in range(2):
+ print("Please choose Robot {}...\n".format(i))
+ options["robots"].append(choose_robots(exclude_bimanual=True))
+
+ # Else, we simply choose a single (single-armed) robot to instantiate in the environment
+ else:
+ options["robots"] = choose_robots(exclude_bimanual=True)
+
+ # Choose controller
+ controller_name = choose_controller()
+
+ # Load the desired controller
+ options["controller_configs"] = load_controller_config(default_controller=controller_name)
+
+ # initialize the task
+ env = suite.make(
+ **options,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ ignore_done=True,
+ use_camera_obs=False,
+ control_freq=20,
+ hard_reset=False, # TODO: Not setting this flag to False brings up a segfault on macos or glfw error on linux
+ )
+ env = DomainRandomizationWrapper(env)
+ env.reset()
+ env.viewer.set_camera(camera_id=0)
+
+ # Get action limits
+ low, high = env.action_spec
+
+ # do visualization
+ for i in range(100):
+ action = np.random.uniform(low, high)
+ obs, reward, done, _ = env.step(action)
+ env.render()
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gripper_interaction.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gripper_interaction.py
new file mode 100644
index 0000000000000000000000000000000000000000..a677725f19e9c5b5c06911f6855c8d2515bee645
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gripper_interaction.py
@@ -0,0 +1,133 @@
+"""Gripper interaction demo.
+
+This script illustrates the process of importing a gripper into a scene and making it interact
+with objects via its actuators. It also shows how to procedurally generate a scene with the
+APIs of the MJCF utility functions.
+
+Example:
+ $ python demo_gripper_interaction.py
+"""
+
+import xml.etree.ElementTree as ET
+
+from robosuite.models import MujocoWorldBase
+from robosuite.models.arenas.table_arena import TableArena
+from robosuite.models.grippers import PandaGripper, RethinkGripper
+from robosuite.models.objects import BoxObject
+from robosuite.utils import OpenCVRenderer
+from robosuite.utils.binding_utils import MjRenderContextOffscreen, MjSim
+from robosuite.utils.mjcf_utils import new_actuator, new_joint
+
+if __name__ == "__main__":
+
+ # start with an empty world
+ world = MujocoWorldBase()
+
+ # add a table
+ arena = TableArena(table_full_size=(0.4, 0.4, 0.05), table_offset=(0, 0, 1.1), has_legs=False)
+ world.merge(arena)
+
+ # add a gripper
+ gripper = RethinkGripper()
+ # Create another body with a slider joint to which we'll add this gripper
+ gripper_body = ET.Element("body", name="gripper_base")
+ gripper_body.set("pos", "0 0 1.3")
+ gripper_body.set("quat", "0 0 1 0") # flip z
+ gripper_body.append(new_joint(name="gripper_z_joint", type="slide", axis="0 0 1", damping="50"))
+ # Add the dummy body with the joint to the global worldbody
+ world.worldbody.append(gripper_body)
+ # Merge the actual gripper as a child of the dummy body
+ world.merge(gripper, merge_body="gripper_base")
+ # Create a new actuator to control our slider joint
+ world.actuator.append(new_actuator(joint="gripper_z_joint", act_type="position", name="gripper_z", kp="500"))
+
+ # add an object for grasping
+ mujoco_object = BoxObject(
+ name="box", size=[0.02, 0.02, 0.02], rgba=[1, 0, 0, 1], friction=[1, 0.005, 0.0001]
+ ).get_obj()
+ # Set the position of this object
+ mujoco_object.set("pos", "0 0 1.11")
+ # Add our object to the world body
+ world.worldbody.append(mujoco_object)
+
+ # add reference objects for x and y axes
+ x_ref = BoxObject(
+ name="x_ref", size=[0.01, 0.01, 0.01], rgba=[0, 1, 0, 1], obj_type="visual", joints=None
+ ).get_obj()
+ x_ref.set("pos", "0.2 0 1.105")
+ world.worldbody.append(x_ref)
+ y_ref = BoxObject(
+ name="y_ref", size=[0.01, 0.01, 0.01], rgba=[0, 0, 1, 1], obj_type="visual", joints=None
+ ).get_obj()
+ y_ref.set("pos", "0 0.2 1.105")
+ world.worldbody.append(y_ref)
+
+ # start simulation
+ model = world.get_model(mode="mujoco")
+
+ sim = MjSim(model)
+ viewer = OpenCVRenderer(sim)
+ render_context = MjRenderContextOffscreen(sim, device_id=-1)
+ sim.add_render_context(render_context)
+
+ sim_state = sim.get_state()
+
+ # for gravity correction
+ gravity_corrected = ["gripper_z_joint"]
+ _ref_joint_vel_indexes = [sim.model.get_joint_qvel_addr(x) for x in gravity_corrected]
+
+ # Set gripper parameters
+ gripper_z_id = sim.model.actuator_name2id("gripper_z")
+ gripper_z_low = 0.07
+ gripper_z_high = -0.02
+ gripper_z_is_low = False
+
+ gripper_jaw_ids = [sim.model.actuator_name2id(x) for x in gripper.actuators]
+ gripper_open = [-0.0115, 0.0115]
+ gripper_closed = [0.020833, -0.020833]
+ gripper_is_closed = True
+
+ # hardcode sequence for gripper looping trajectory
+ seq = [(False, False), (True, False), (True, True), (False, True)]
+
+ sim.set_state(sim_state)
+ step = 0
+ T = 500
+ while True:
+ if step % 100 == 0:
+ print("step: {}".format(step))
+
+ # Get contact information
+ for contact in sim.data.contact[0 : sim.data.ncon]:
+
+ geom_name1 = sim.model.geom_id2name(contact.geom1)
+ geom_name2 = sim.model.geom_id2name(contact.geom2)
+ if geom_name1 == "floor" and geom_name2 == "floor":
+ continue
+
+ print("geom1: {}, geom2: {}".format(geom_name1, geom_name2))
+ print("contact id {}".format(id(contact)))
+ print("friction: {}".format(contact.friction))
+ print("normal: {}".format(contact.frame[0:3]))
+
+ # Iterate through gripping trajectory
+ if step % T == 0:
+ plan = seq[int(step / T) % len(seq)]
+ gripper_z_is_low, gripper_is_closed = plan
+ print("changing plan: gripper low: {}, gripper closed {}".format(gripper_z_is_low, gripper_is_closed))
+
+ # Control gripper
+ if gripper_z_is_low:
+ sim.data.ctrl[gripper_z_id] = gripper_z_low
+ else:
+ sim.data.ctrl[gripper_z_id] = gripper_z_high
+ if gripper_is_closed:
+ sim.data.ctrl[gripper_jaw_ids] = gripper_closed
+ else:
+ sim.data.ctrl[gripper_jaw_ids] = gripper_open
+
+ # Step through sim
+ sim.step()
+ sim.data.qfrc_applied[_ref_joint_vel_indexes] = sim.data.qfrc_bias[_ref_joint_vel_indexes]
+ viewer.render()
+ step += 1
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gripper_selection.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gripper_selection.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f87a800b83b7d7754286d7334d9b9abf3c1bf12
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gripper_selection.py
@@ -0,0 +1,45 @@
+"""
+This script shows you how to select gripper for an environment.
+This is controlled by gripper_type keyword argument.
+"""
+import numpy as np
+
+import robosuite as suite
+from robosuite import ALL_GRIPPERS
+
+if __name__ == "__main__":
+
+ for gripper in ALL_GRIPPERS:
+
+ # Notify user which gripper we're currently using
+ print("Using gripper {}...".format(gripper))
+
+ # create environment with selected grippers
+ env = suite.make(
+ "Lift",
+ robots="Panda",
+ gripper_types=gripper,
+ has_renderer=True, # make sure we can render to the screen
+ has_offscreen_renderer=False, # not needed since not using pixel obs
+ use_camera_obs=False, # do not use pixel observations
+ control_freq=50, # control should happen fast enough so that simulation looks smoother
+ camera_names="frontview",
+ )
+
+ # Reset the env
+ env.reset()
+
+ # Get action limits
+ low, high = env.action_spec
+
+ # Run random policy
+ for t in range(100):
+ env.render()
+ action = np.random.uniform(low, high)
+ observation, reward, done, info = env.step(action)
+ if done:
+ print("Episode finished after {} timesteps".format(t + 1))
+ break
+
+ # close window
+ env.close()
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gym_functionality.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gym_functionality.py
new file mode 100644
index 0000000000000000000000000000000000000000..381733676c140e168791120702453c0117e93498
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_gym_functionality.py
@@ -0,0 +1,61 @@
+"""
+This script shows how to adapt an environment to be compatible
+with the Gymnasium API. This is useful when using
+learning pipelines that require supporting these APIs.
+
+For instance, this can be used with OpenAI Baselines
+(https://github.com/openai/baselines) to train agents
+with RL.
+
+
+We base this script off of some code snippets found
+in the "Basic Usage" section of the Gymnasium documentation
+
+The following snippet was used to demo basic functionality.
+
+ import gymnasium as gym
+ env = gym.make("LunarLander-v2", render_mode="human")
+ observation, info = env.reset()
+
+ for _ in range(1000):
+ action = env.action_space.sample() # agent policy that uses the observation and info
+ observation, reward, terminated, truncated, info = env.step(action)
+ if terminated or truncated:
+ observation, info = env.reset()
+ env.close()
+
+To adapt our APIs to be compatible with the Gymnasium API, this script
+demonstrates how this can be easily achieved by using the GymWrapper.
+"""
+
+import robosuite as suite
+from robosuite.wrappers import GymWrapper
+
+if __name__ == "__main__":
+
+ # Notice how the environment is wrapped by the wrapper
+ env = GymWrapper(
+ suite.make(
+ "Lift",
+ robots="Sawyer", # use Sawyer robot
+ use_camera_obs=False, # do not use pixel observations
+ has_offscreen_renderer=False, # not needed since not using pixel obs
+ has_renderer=True, # make sure we can render to the screen
+ reward_shaping=True, # use dense rewards
+ control_freq=20, # control should happen fast enough so that simulation looks smooth
+ )
+ )
+
+ env.reset(seed=0)
+
+ for i_episode in range(20):
+ observation = env.reset()
+ for t in range(500):
+ env.render()
+ action = env.action_space.sample()
+ observation, reward, terminated, truncated, info = env.step(action)
+ if terminated or truncated:
+ print("Episode finished after {} timesteps".format(t + 1))
+ observation, info = env.reset()
+ env.close()
+ break
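+
+# Editor's note (illustrative): GymWrapper flattens the selected observation keys into a single
+# flat observation. As an assumption about the upstream API, an explicit key list can be passed,
+# e.g. GymWrapper(suite.make(...), keys=["robot0_proprio-state", "object-state"]).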
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_nvisii_modalities.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_nvisii_modalities.py
new file mode 100644
index 0000000000000000000000000000000000000000..37728c561c01bbd554eef3f42875c356c2bf19f5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_nvisii_modalities.py
@@ -0,0 +1,106 @@
+"""
+Dumps video of the modality specified from the renderer.
+"""
+
+import argparse
+
+import imageio
+import matplotlib.cm
+import numpy as np
+
+import robosuite as suite
+import robosuite.macros as macros
+from robosuite.controllers import load_controller_config
+from robosuite.renderers import load_renderer_config
+from robosuite.utils.input_utils import *
+
+if __name__ == "__main__":
+
+ """
+ Registered environments: Lift, Stack, NutAssembly, NutAssemblySingle, NutAssemblySquare, NutAssemblyRound,
+ PickPlace, PickPlaceSingle, PickPlaceMilk, PickPlaceBread, PickPlaceCereal,
+ PickPlaceCan, Door, Wipe, TwoArmLift, TwoArmPegInHole, TwoArmHandover
+
+ Possible robots: Baxter, IIWA, Jaco, Kinova3, Panda, Sawyer, UR5e
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--vision-modality",
+ type=str,
+ default="rgb",
+ help="Modality to render. Could be set to `depth`, `normal`, `segmentation`, or `rgb`",
+ )
+
+ args = parser.parse_args()
+
+ options = {}
+
+ # print welcome info
+ print("Welcome to robosuite v{}!".format(suite.__version__))
+ print(suite.__logo__)
+
+ options["env_name"] = choose_environment()
+
+ # If a multi-arm environment has been chosen, choose configuration and appropriate robot(s)
+ if "TwoArm" in options["env_name"]:
+ # Choose env config and add it to options
+ options["env_configuration"] = choose_multi_arm_config()
+
+ # If chosen configuration was bimanual, the corresponding robot must be Baxter. Else, have user choose robots
+ if options["env_configuration"] == "bimanual":
+ options["robots"] = "Baxter"
+ else:
+ options["robots"] = []
+
+ # Have user choose two robots
+ print("A multiple single-arm configuration was chosen.\n")
+
+ for i in range(2):
+ print("Please choose Robot {}...\n".format(i))
+ options["robots"].append(choose_robots(exclude_bimanual=True))
+
+ # Else, we simply choose a single (single-armed) robot to instantiate in the environment
+ else:
+ options["robots"] = choose_robots(exclude_bimanual=True)
+
+ # Load the desired controller
+ options["controller_configs"] = load_controller_config(default_controller="OSC_POSE")
+
+ # change renderer config
+ config = load_renderer_config("nvisii")
+
+ if args.vision_modality == "rgb":
+ config["vision_modalities"] = None
+ if args.vision_modality == "segmentation":
+ config["vision_modalities"] = "segmentation"
+ if args.vision_modality == "depth":
+ config["vision_modalities"] = "depth"
+ if args.vision_modality == "normal":
+ config["vision_modalities"] = "normal"
+
+ env = suite.make(
+ **options,
+ has_renderer=False, # no on-screen renderer
+ has_offscreen_renderer=False, # no off-screen renderer
+ ignore_done=True,
+ use_camera_obs=False, # no camera observations
+ control_freq=20,
+ renderer="nvisii",
+ renderer_config=config,
+ camera_segmentations="element" if config["vision_modalities"] == "segmentation" else None,
+ )
+
+ env.reset()
+
+ low, high = env.action_spec
+
+ timesteps = 300
+ for i in range(timesteps):
+ action = np.random.uniform(low, high)
+ obs, reward, done, _ = env.step(action)
+
+ if i % 100 == 0:
+ env.render()
+
+ env.close_renderer()
+ print("Done.")
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_random_action.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_random_action.py
new file mode 100644
index 0000000000000000000000000000000000000000..88f196758fee9f53cb16b8fceb9c7efe6e37286b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_random_action.py
@@ -0,0 +1,63 @@
+from robosuite.controllers import load_controller_config
+from robosuite.utils.input_utils import *
+
+if __name__ == "__main__":
+
+ # Create dict to hold options that will be passed to env creation call
+ options = {}
+
+ # print welcome info
+ print("Welcome to robosuite v{}!".format(suite.__version__))
+ print(suite.__logo__)
+
+ # Choose environment and add it to options
+ options["env_name"] = choose_environment()
+
+ # If a multi-arm environment has been chosen, choose configuration and appropriate robot(s)
+ if "TwoArm" in options["env_name"]:
+ # Choose env config and add it to options
+ options["env_configuration"] = choose_multi_arm_config()
+
+ # If chosen configuration was bimanual, the corresponding robot must be Baxter. Else, have user choose robots
+ if options["env_configuration"] == "bimanual":
+ options["robots"] = "Baxter"
+ else:
+ options["robots"] = []
+
+ # Have user choose two robots
+ print("A multiple single-arm configuration was chosen.\n")
+
+ for i in range(2):
+ print("Please choose Robot {}...\n".format(i))
+ options["robots"].append(choose_robots(exclude_bimanual=True))
+
+ # Else, we simply choose a single (single-armed) robot to instantiate in the environment
+ else:
+ options["robots"] = choose_robots(exclude_bimanual=True)
+
+ # Choose controller
+ controller_name = choose_controller()
+
+ # Load the desired controller
+ options["controller_configs"] = load_controller_config(default_controller=controller_name)
+
+ # initialize the task
+ env = suite.make(
+ **options,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ ignore_done=True,
+ use_camera_obs=False,
+ control_freq=20,
+ )
+ env.reset()
+ env.viewer.set_camera(camera_id=0)
+
+ # Get action limits
+ low, high = env.action_spec
+
+ # do visualization
+ for i in range(10000):
+ action = np.random.uniform(low, high)
+ obs, reward, done, _ = env.step(action)
+ env.render()
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_renderers.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_renderers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e6ae44ca6f0fb08953a5921ae173054c0b1930b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_renderers.py
@@ -0,0 +1,107 @@
+import argparse
+import json
+
+import numpy as np
+
+import robosuite as suite
+import robosuite.utils.transform_utils as T
+from robosuite.controllers import load_controller_config
+from robosuite.renderers import load_renderer_config
+from robosuite.utils.input_utils import *
+
+
+def str2bool(v):
+ if v.lower() in ("yes", "true", "t", "y", "1"):
+ return True
+ elif v.lower() in ("no", "false", "f", "n", "0"):
+ return False
+ else:
+ raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
+if __name__ == "__main__":
+
+ """
+ Registered environments: Lift, Stack, NutAssembly, NutAssemblySingle, NutAssemblySquare, NutAssemblyRound,
+ PickPlace, PickPlaceSingle, PickPlaceMilk, PickPlaceBread, PickPlaceCereal,
+ PickPlaceCan, Door, Wipe, TwoArmLift, TwoArmPegInHole, TwoArmHandover
+
+ Possible robots: Baxter, IIWA, Jaco, Kinova3, Panda, Sawyer, UR5e
+ """
+
+ options = {}
+
+ # print welcome info
+ print("Welcome to robosuite v{}!".format(suite.__version__))
+ print(suite.__logo__)
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--renderer", type=str, default="mujoco", help="Valid options include mujoco, and nvisii")
+
+ args = parser.parse_args()
+ renderer = args.renderer
+
+ options["env_name"] = choose_environment()
+
+ # If a multi-arm environment has been chosen, choose configuration and appropriate robot(s)
+ if "TwoArm" in options["env_name"]:
+ # Choose env config and add it to options
+ options["env_configuration"] = choose_multi_arm_config()
+
+ # If chosen configuration was bimanual, the corresponding robot must be Baxter. Else, have user choose robots
+ if options["env_configuration"] == "bimanual":
+ options["robots"] = "Baxter"
+ else:
+ options["robots"] = []
+
+ # Have user choose two robots
+ print("A multiple single-arm configuration was chosen.\n")
+
+ for i in range(2):
+ print("Please choose Robot {}...\n".format(i))
+ options["robots"].append(choose_robots(exclude_bimanual=True))
+
+ # Else, we simply choose a single (single-armed) robot to instantiate in the environment
+ else:
+ options["robots"] = choose_robots(exclude_bimanual=True)
+
+ # Choose controller
+ controller_name = choose_controller()
+
+ # Load the desired controller
+ options["controller_configs"] = load_controller_config(default_controller=controller_name)
+
+ env = suite.make(
+ **options,
+ has_renderer=False if renderer != "mujoco" else True, # no on-screen renderer
+ has_offscreen_renderer=False, # no off-screen renderer
+ ignore_done=True,
+ use_camera_obs=False, # no camera observations
+ control_freq=20,
+ renderer=renderer,
+ )
+
+ env.reset()
+
+ low, high = env.action_spec
+
+ if renderer == "nvisii":
+
+ timesteps = 300
+ for i in range(timesteps):
+ action = np.random.uniform(low, high)
+ obs, reward, done, _ = env.step(action)
+
+ if i % 100 == 0:
+ env.render()
+
+ else:
+
+ # do visualization
+ for i in range(10000):
+ action = np.random.uniform(low, high)
+ obs, reward, done, _ = env.step(action)
+ env.render()
+
+ env.close_renderer()
+ print("Done.")
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_segmentation.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_segmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..54c25b962ead4945e55f0fe87a83d8e55d9fbcdb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_segmentation.py
@@ -0,0 +1,118 @@
+"""
+Play random actions in an environment and render a video that demonstrates segmentation.
+"""
+import argparse
+import colorsys
+import json
+import random
+
+import imageio
+import matplotlib.cm as cm
+import numpy as np
+from PIL import Image
+
+import robosuite as suite
+from robosuite.controllers import load_controller_config
+
+
+def randomize_colors(N, bright=True):
+ """
+ Modified from https://github.com/matterport/Mask_RCNN/blob/master/mrcnn/visualize.py#L59
+ Generate random colors.
+ To get visually distinct colors, generate them in HSV space then
+ convert to RGB.
+ """
+ brightness = 1.0 if bright else 0.5
+ hsv = [(1.0 * i / N, 1, brightness) for i in range(N)]
+ colors = np.array(list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv)))
+ rstate = np.random.RandomState(seed=20)
+    rstate.shuffle(colors)
+ return colors
+
+
+def segmentation_to_rgb(seg_im, random_colors=False):
+ """
+ Helper function to visualize segmentations as RGB frames.
+ NOTE: assumes that geom IDs go up to 255 at most - if not,
+ multiple geoms might be assigned to the same color.
+ """
+ # ensure all values lie within [0, 255]
+ seg_im = np.mod(seg_im, 256)
+
+ if random_colors:
+ colors = randomize_colors(N=256, bright=True)
+ return (255.0 * colors[seg_im]).astype(np.uint8)
+ else:
+ # deterministic shuffling of values to map each geom ID to a random int in [0, 255]
+ rstate = np.random.RandomState(seed=8)
+ inds = np.arange(256)
+ rstate.shuffle(inds)
+
+ # use @inds to map each geom ID to a color
+ return (255.0 * cm.rainbow(inds[seg_im], 3)).astype(np.uint8)[..., :3]
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--video-path", type=str, default="/tmp/video.mp4", help="Path to video file")
+ parser.add_argument("--random-colors", action="store_true", help="Radnomize segmentation colors")
+ parser.add_argument("--segmentation-level", type=str, default="element", help="instance, class, or element")
+ args = parser.parse_args()
+
+ # Create dict to hold options that will be passed to env creation call
+ options = {}
+
+ # Choose environment and add it to options
+ options["env_name"] = "TwoArmHandover"
+ options["robots"] = ["Panda", "Panda"]
+
+ # Choose controller
+ controller_name = "OSC_POSE"
+
+ # Choose camera
+ camera = "frontview"
+
+ # Choose segmentation type
+ segmentation_level = args.segmentation_level # Options are {instance, class, element}
+
+ # Load the desired controller
+ options["controller_configs"] = load_controller_config(default_controller=controller_name)
+
+ # initialize the task
+ env = suite.make(
+ **options,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ ignore_done=True,
+ use_camera_obs=True,
+ control_freq=20,
+ camera_names=camera,
+ camera_segmentations=segmentation_level,
+ camera_heights=512,
+ camera_widths=512,
+ )
+ env.reset()
+
+ video_writer = imageio.get_writer(args.video_path, fps=20)
+
+ # Get action limits
+ low, high = env.action_spec
+
+ # do visualization
+ for i in range(100):
+ action = 0.5 * np.random.uniform(low, high)
+ obs, reward, done, _ = env.step(action)
+
+ video_img = obs[f"{camera}_segmentation_{segmentation_level}"].squeeze(-1)[::-1]
+ np.savetxt("/tmp/seg_{}.txt".format(i), video_img, fmt="%.2f")
+ video_img = segmentation_to_rgb(video_img, args.random_colors)
+ video_writer.append_data(video_img)
+
+ image = Image.fromarray(video_img)
+ image.save("/tmp/seg_{}.png".format(i))
+ if i % 5 == 0:
+ print("Step #{} / 100".format(i))
+
+ video_writer.close()
+ print("Video saved to {}".format(args.video_path))
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_sensor_corruption.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_sensor_corruption.py
new file mode 100644
index 0000000000000000000000000000000000000000..8207741e14cc4d99efff3fd379e302f935f91fbe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_sensor_corruption.py
@@ -0,0 +1,264 @@
+"""Sensor Corruption Demo.
+
+This script provides an example of using the Observables functionality to implement a corrupted sensor
+(corruption + delay).
+Images will be rendered in a delayed fashion, so the user's actions will appear delayed.
+
+This is a modified version of the demo_device_control teleoperation script.
+
+Example:
+ $ python demo_sensor_corruption.py --environment Stack --robots Panda --delay 0.05 --corruption 5.0 --toggle-corruption-on-grasp
+"""
+
+import argparse
+import sys
+
+import cv2
+import numpy as np
+
+import robosuite as suite
+from robosuite import load_controller_config
+from robosuite.utils.input_utils import input2action
+from robosuite.utils.observables import Observable, create_gaussian_noise_corrupter, create_uniform_sampled_delayer
+from robosuite.wrappers import VisualizationWrapper
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--environment", type=str, default="Lift")
+ parser.add_argument("--robots", nargs="+", type=str, default="Panda", help="Which robot(s) to use in the env")
+ parser.add_argument(
+ "--config", type=str, default="single-arm-opposed", help="Specified environment configuration if necessary"
+ )
+ parser.add_argument("--arm", type=str, default="right", help="Which arm to control (eg bimanual) 'right' or 'left'")
+ parser.add_argument("--switch-on-grasp", action="store_true", help="Switch gripper control on gripper action")
+ parser.add_argument(
+ "--toggle-corruption-on-grasp", action="store_true", help="Toggle corruption ON / OFF on gripper action"
+ )
+ parser.add_argument("--controller", type=str, default="osc", help="Choice of controller. Can be 'ik' or 'osc'")
+ parser.add_argument("--device", type=str, default="keyboard")
+ parser.add_argument("--pos-sensitivity", type=float, default=1.0, help="How much to scale position user inputs")
+ parser.add_argument("--rot-sensitivity", type=float, default=1.0, help="How much to scale rotation user inputs")
+ parser.add_argument("--delay", type=float, default=0.04, help="average delay to use (sec)")
+ parser.add_argument("--corruption", type=float, default=20.0, help="Scale of corruption to use (std dev)")
+ parser.add_argument("--camera", type=str, default="agentview", help="Name of camera to render")
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--height", type=int, default=384)
+ args = parser.parse_args()
+
+ # Import controller config for EE IK or OSC (pos/ori)
+ if args.controller == "ik":
+ controller_name = "IK_POSE"
+ elif args.controller == "osc":
+ controller_name = "OSC_POSE"
+ else:
+ print("Error: Unsupported controller specified. Must be either 'ik' or 'osc'!")
+ raise ValueError
+
+ # Get controller config
+ controller_config = load_controller_config(default_controller=controller_name)
+
+ # Create argument configuration
+ config = {
+ "env_name": args.environment,
+ "robots": args.robots,
+ "controller_configs": controller_config,
+ }
+
+ # Check if we're using a multi-armed environment and use env_configuration argument if so
+ if "TwoArm" in args.environment:
+ config["env_configuration"] = args.config
+ else:
+ args.config = None
+
+ # Create environment
+ env = suite.make(
+ **config,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ ignore_done=True,
+ camera_names=args.camera,
+ camera_heights=args.height,
+ camera_widths=args.width,
+ use_camera_obs=True,
+ use_object_obs=True,
+ hard_reset=False,
+ )
+
+ # Wrap this environment in a visualization wrapper
+ env = VisualizationWrapper(env, indicator_configs=None)
+
+ # Set shared settings
+ attributes = ["corrupter", "delayer", "sampling_rate"]
+ corruption_mode = 1 # 1 is corruption = ON, 0 is corruption = OFF
+ obs_settings = {}
+
+ # Function to easily modify observable on the fly
+ def modify_obs(obs_name, attrs, mods):
+ for attr, mod in zip(attrs, mods):
+ env.modify_observable(
+ observable_name=obs_name,
+ attribute=attr,
+ modifier=mod,
+ )
+
+ # Add image corruption and delay
+ image_sampling_rate = 10.0
+ image_obs_name = f"{args.camera}_image"
+ image_corrupter = create_gaussian_noise_corrupter(mean=0.0, std=args.corruption, low=0, high=255)
+ image_delayer = create_uniform_sampled_delayer(min_delay=max(0, args.delay - 0.025), max_delay=args.delay + 0.025)
+ image_modifiers = [image_corrupter, image_delayer, image_sampling_rate]
+
+ # Initialize settings
+ modify_obs(obs_name=image_obs_name, attrs=attributes, mods=image_modifiers)
+
+ # Add entry for the corruption / delay settings in dict
+ obs_settings[image_obs_name] = {
+ "attrs": attributes[:2],
+ "mods": lambda: image_modifiers[:2] if corruption_mode else [None, None],
+ }
+
+ # Add proprioception corruption and delay
+ proprio_sampling_rate = 20.0
+ proprio_obs_name = f"{env.robots[0].robot_model.naming_prefix}joint_pos"
+ joint_limits = env.sim.model.jnt_range[env.robots[0]._ref_joint_indexes]
+ joint_range = joint_limits[:, 1] - joint_limits[:, 0]
+ proprio_corrupter = create_gaussian_noise_corrupter(mean=0.0, std=joint_range / 50.0)
+ curr_proprio_delay = 0.0
+ tmp_delayer = create_uniform_sampled_delayer(
+ min_delay=max(0, (args.delay - 0.025) / 2), max_delay=(args.delay + 0.025) / 2
+ )
+
+ # Define delayer to synchronize delay between ground truth and corrupted sensors
+ def proprio_delayer():
+ global curr_proprio_delay
+ curr_proprio_delay = tmp_delayer()
+ return curr_proprio_delay
+
+ # Define function to convert raw delay time to actual sampling delay (in discrete timesteps)
+ def calculate_proprio_delay():
+ base = env.model_timestep
+ return base * round(curr_proprio_delay / base) if corruption_mode else 0.0
+
+ proprio_modifiers = [proprio_corrupter, proprio_delayer, proprio_sampling_rate]
+
+ # We will create a separate "ground truth" delayed proprio observable to track exactly
+ # how much corruption we're getting in real time
+ proprio_sensor = env._observables[proprio_obs_name]._sensor
+ proprio_ground_truth_obs_name = f"{proprio_obs_name}_ground_truth"
+ observable = Observable(
+ name=proprio_ground_truth_obs_name,
+ sensor=proprio_sensor,
+ delayer=lambda: curr_proprio_delay,
+ sampling_rate=proprio_sampling_rate,
+ )
+
+ # Add this observable
+ env.add_observable(observable)
+
+ # We also need to set the normal joint pos observable to be active (not active by default)
+ env.modify_observable(observable_name=proprio_obs_name, attribute="active", modifier=True)
+
+ # Initialize settings
+ modify_obs(obs_name=proprio_obs_name, attrs=attributes, mods=proprio_modifiers)
+
+ # Add entry for the corruption / delay settings in dict
+ obs_settings[proprio_obs_name] = {
+ "attrs": attributes[:2],
+ "mods": lambda: proprio_modifiers[:2] if corruption_mode else [None, None],
+ }
+ obs_settings[proprio_ground_truth_obs_name] = {
+ "attrs": [attributes[1]],
+ "mods": lambda: [lambda: curr_proprio_delay] if corruption_mode else [None],
+ }
+
+ # Setup printing options for numbers
+ np.set_printoptions(precision=3, suppress=True, floatmode="fixed")
+
+ # initialize device
+ if args.device == "keyboard":
+ from robosuite.devices import Keyboard
+
+ device = Keyboard(pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
+ elif args.device == "spacemouse":
+ from robosuite.devices import SpaceMouse
+
+ device = SpaceMouse(pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
+ else:
+ raise Exception("Invalid device choice: choose either 'keyboard' or 'spacemouse'.")
+
+ while True:
+ # Reset the environment
+ obs = env.reset()
+
+ # Reset corruption mode
+ corruption_mode = 1
+
+        # Initialize variables that should be maintained between resets
+ last_grasp = 0
+
+ # Initialize device control
+ device.start_control()
+
+ while True:
+ # Set active robot
+ active_robot = env.robots[0] if args.config == "bimanual" else env.robots[args.arm == "left"]
+
+ # Get the newest action
+ action, grasp = input2action(
+ device=device, robot=active_robot, active_arm=args.arm, env_configuration=args.config
+ )
+
+            # If action is None, then this is a reset, so we should break
+ if action is None:
+ break
+
+ # If the current grasp is active (1) and last grasp is not (-1) (i.e.: grasping input just pressed),
+ # toggle arm control and / or corruption if requested
+ if last_grasp < 0 < grasp:
+ if args.switch_on_grasp:
+ args.arm = "left" if args.arm == "right" else "right"
+ if args.toggle_corruption_on_grasp:
+ # Toggle corruption and update observable
+ corruption_mode = 1 - corruption_mode
+ for obs_name, settings in obs_settings.items():
+ modify_obs(obs_name=obs_name, attrs=settings["attrs"], mods=settings["mods"]())
+ # Update last grasp
+ last_grasp = grasp
+
+ # Fill out the rest of the action space if necessary
+ rem_action_dim = env.action_dim - action.size
+ if rem_action_dim > 0:
+ # Initialize remaining action space
+ rem_action = np.zeros(rem_action_dim)
+ # This is a multi-arm setting, choose which arm to control and fill the rest with zeros
+ if args.arm == "right":
+ action = np.concatenate([action, rem_action])
+ elif args.arm == "left":
+ action = np.concatenate([rem_action, action])
+ else:
+ # Only right and left arms supported
+ print(
+ "Error: Unsupported arm specified -- "
+ "must be either 'right' or 'left'! Got: {}".format(args.arm)
+ )
+ elif rem_action_dim < 0:
+ # We're in an environment with no gripper action space, so trim the action space to be the action dim
+ action = action[: env.action_dim]
+
+ # Step through the simulation and render
+ obs, reward, done, info = env.step(action)
+
+ # Calculate and print out stats for proprio observation
+ observed_value = obs[proprio_obs_name]
+ ground_truth_delayed_value = obs[proprio_ground_truth_obs_name]
+ print(
+ f"Observed joint pos: {observed_value}, "
+ f"Corruption: {observed_value - ground_truth_delayed_value}, "
+ f"Delay: {calculate_proprio_delay():.3f} sec"
+ )
+
+ # read camera observation
+ im = np.flip(obs[args.camera + "_image"][..., ::-1], 0).astype(np.uint8)
+
+ cv2.imshow("offscreen render", im)
+ cv2.waitKey(1)
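The same corrupter / delayer machinery can be attached outside of a teleoperation loop; a minimal sketch that corrupts and delays the agentview camera of a plain Lift environment (the environment, robot, and noise/delay numbers here are illustrative):

    import robosuite as suite
    from robosuite.utils.observables import create_gaussian_noise_corrupter, create_uniform_sampled_delayer

    env = suite.make(
        "Lift",
        robots="Panda",
        has_renderer=False,
        has_offscreen_renderer=True,
        use_camera_obs=True,
        camera_names="agentview",
        ignore_done=True,
    )

    corrupter = create_gaussian_noise_corrupter(mean=0.0, std=20.0, low=0, high=255)
    delayer = create_uniform_sampled_delayer(min_delay=0.0, max_delay=0.05)
    for attr, mod in zip(["corrupter", "delayer", "sampling_rate"], [corrupter, delayer, 10.0]):
        env.modify_observable(observable_name="agentview_image", attribute=attr, modifier=mod)

    obs = env.reset()
    low, high = env.action_spec
    obs, reward, done, info = env.step(0.0 * low)   # zero action, just to advance time
    noisy_frame = obs["agentview_image"]            # corrupted / delayed image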
diff --git a/phantom/submodules/phantom-robosuite/robosuite/demos/demo_video_recording.py b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_video_recording.py
new file mode 100644
index 0000000000000000000000000000000000000000..3424f289a9a95ec8029a0a9bee2cdc7f025f09cb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/demos/demo_video_recording.py
@@ -0,0 +1,69 @@
+"""
+Record video of agent episodes with the imageio library.
+This script uses offscreen rendering.
+
+Example:
+ $ python demo_video_recording.py --environment Lift --robots Panda
+"""
+
+import argparse
+
+import imageio
+import numpy as np
+
+import robosuite.macros as macros
+from robosuite import make
+
+# Set the image convention to opencv so that the images are automatically rendered "right side up" when using imageio
+# (which uses opencv convention)
+macros.IMAGE_CONVENTION = "opencv"
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--environment", type=str, default="Stack")
+ parser.add_argument("--robots", nargs="+", type=str, default="Panda", help="Which robot(s) to use in the env")
+ parser.add_argument("--camera", type=str, default="agentview", help="Name of camera to render")
+ parser.add_argument("--video_path", type=str, default="video.mp4")
+ parser.add_argument("--timesteps", type=int, default=500)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--skip_frame", type=int, default=1)
+ args = parser.parse_args()
+
+ # initialize an environment with offscreen renderer
+ env = make(
+ args.environment,
+ args.robots,
+ has_renderer=False,
+ ignore_done=True,
+ use_camera_obs=True,
+ use_object_obs=False,
+ camera_names=args.camera,
+ camera_heights=args.height,
+ camera_widths=args.width,
+ )
+
+ obs = env.reset()
+ ndim = env.action_dim
+
+ # create a video writer with imageio
+ writer = imageio.get_writer(args.video_path, fps=20)
+
+ frames = []
+ for i in range(args.timesteps):
+
+        # run a random agent with Gaussian-sampled actions
+ action = 0.5 * np.random.randn(ndim)
+ obs, reward, done, info = env.step(action)
+
+ # dump a frame from every K frames
+ if i % args.skip_frame == 0:
+ frame = obs[args.camera + "_image"]
+ writer.append_data(frame)
+ print("Saving frame #{}".format(i))
+
+ if done:
+ break
+
+ writer.close()
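macros.IMAGE_CONVENTION is the important switch here: under the "mujoco" convention offscreen frames are vertically flipped relative to what image viewers expect (which is why demo_sensor_corruption flips them manually), while "opencv" hands them to imageio right side up. A minimal sketch that grabs a single frame instead of a video (Lift and Panda assumed):

    import imageio

    import robosuite.macros as macros
    from robosuite import make

    macros.IMAGE_CONVENTION = "opencv"   # frames arrive right side up for imageio

    env = make(
        "Lift",
        "Panda",
        has_renderer=False,
        use_camera_obs=True,
        camera_names="agentview",
        camera_heights=256,
        camera_widths=256,
        ignore_done=True,
    )
    obs = env.reset()
    imageio.imwrite("frame.png", obs["agentview_image"])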
diff --git a/phantom/submodules/phantom-robosuite/robosuite/devices/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/devices/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddabf0335463d61f23889c261ec8b5f11c707c24
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/devices/__init__.py
@@ -0,0 +1,11 @@
+from .device import Device
+from .keyboard import Keyboard
+
+try:
+ from .spacemouse import SpaceMouse
+except ImportError:
+ print(
+ """Unable to load module hid, required to interface with SpaceMouse.\n
+ Only macOS is officially supported. Install the additional\n
+ requirements with `pip install -r requirements-extra.txt`"""
+ )
diff --git a/phantom/submodules/phantom-robosuite/robosuite/devices/device.py b/phantom/submodules/phantom-robosuite/robosuite/devices/device.py
new file mode 100644
index 0000000000000000000000000000000000000000..918523b751d5a1aad0e0310b3a475837d20c018f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/devices/device.py
@@ -0,0 +1,21 @@
+import abc # for abstract base class definitions
+
+
+class Device(metaclass=abc.ABCMeta):
+ """
+ Base class for all robot controllers.
+ Defines basic interface for all controllers to adhere to.
+ """
+
+ @abc.abstractmethod
+ def start_control(self):
+ """
+ Method that should be called externally before controller can
+ start receiving commands.
+ """
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def get_controller_state(self):
+ """Returns the current state of the device, a dictionary of pos, orn, grasp, and reset."""
+ raise NotImplementedError
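A new input device only needs these two methods; a minimal sketch of a stub device that always reports zero motion (the returned keys mirror what the Keyboard and SpaceMouse drivers below provide):

    import numpy as np

    from robosuite.devices import Device


    class NullDevice(Device):
        """A stub device that commands zero motion; handy for exercising a control pipeline."""

        def start_control(self):
            pass  # nothing to initialize

        def get_controller_state(self):
            # same keys as Keyboard.get_controller_state / SpaceMouse.get_controller_state
            return dict(
                dpos=np.zeros(3),
                rotation=np.eye(3),
                raw_drotation=np.zeros(3),
                grasp=0,
                reset=0,
            )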
diff --git a/phantom/submodules/phantom-robosuite/robosuite/devices/keyboard.py b/phantom/submodules/phantom-robosuite/robosuite/devices/keyboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb37648d61f6efe5d9d1946cc155effccad65d35
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/devices/keyboard.py
@@ -0,0 +1,170 @@
+"""
+Driver class for Keyboard controller.
+"""
+
+import numpy as np
+from pynput.keyboard import Controller, Key, Listener
+
+from robosuite.devices import Device
+from robosuite.utils.transform_utils import rotation_matrix
+
+
+class Keyboard(Device):
+ """
+ A minimalistic driver class for a Keyboard.
+ Args:
+ pos_sensitivity (float): Magnitude of input position command scaling
+        rot_sensitivity (float): Magnitude of input rotation command scaling
+ """
+
+ def __init__(self, pos_sensitivity=1.0, rot_sensitivity=1.0):
+
+ self._display_controls()
+ self._reset_internal_state()
+
+ self._reset_state = 0
+ self._enabled = False
+ self._pos_step = 0.05
+
+ self.pos_sensitivity = pos_sensitivity
+ self.rot_sensitivity = rot_sensitivity
+
+ # make a thread to listen to keyboard and register our callback functions
+ self.listener = Listener(on_press=self.on_press, on_release=self.on_release)
+
+ # start listening
+ self.listener.start()
+
+ @staticmethod
+ def _display_controls():
+ """
+ Method to pretty print controls.
+ """
+
+ def print_command(char, info):
+ char += " " * (10 - len(char))
+ print("{}\t{}".format(char, info))
+
+ print("")
+ print_command("Keys", "Command")
+ print_command("q", "reset simulation")
+ print_command("spacebar", "toggle gripper (open/close)")
+ print_command("w-a-s-d", "move arm horizontally in x-y plane")
+ print_command("r-f", "move arm vertically")
+ print_command("z-x", "rotate arm about x-axis")
+ print_command("t-g", "rotate arm about y-axis")
+ print_command("c-v", "rotate arm about z-axis")
+ print("")
+
+ def _reset_internal_state(self):
+ """
+ Resets internal state of controller, except for the reset signal.
+ """
+ self.rotation = np.array([[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0]])
+ self.raw_drotation = np.zeros(3) # immediate roll, pitch, yaw delta values from keyboard hits
+ self.last_drotation = np.zeros(3)
+ self.pos = np.zeros(3) # (x, y, z)
+ self.last_pos = np.zeros(3)
+ self.grasp = False
+
+ def start_control(self):
+ """
+ Method that should be called externally before controller can
+ start receiving commands.
+ """
+ self._reset_internal_state()
+ self._reset_state = 0
+ self._enabled = True
+
+ def get_controller_state(self):
+ """
+ Grabs the current state of the keyboard.
+ Returns:
+ dict: A dictionary containing dpos, orn, unmodified orn, grasp, and reset
+ """
+
+ dpos = self.pos - self.last_pos
+ self.last_pos = np.array(self.pos)
+ raw_drotation = (
+ self.raw_drotation - self.last_drotation
+ ) # create local variable to return, then reset internal drotation
+ self.last_drotation = np.array(self.raw_drotation)
+ return dict(
+ dpos=dpos,
+ rotation=self.rotation,
+ raw_drotation=raw_drotation,
+ grasp=int(self.grasp),
+ reset=self._reset_state,
+ )
+
+ def on_press(self, key):
+ """
+ Key handler for key presses.
+ Args:
+ key (str): key that was pressed
+ """
+
+ try:
+ # controls for moving position
+ if key.char == "w":
+ self.pos[0] -= self._pos_step * self.pos_sensitivity # dec x
+ elif key.char == "s":
+ self.pos[0] += self._pos_step * self.pos_sensitivity # inc x
+ elif key.char == "a":
+ self.pos[1] -= self._pos_step * self.pos_sensitivity # dec y
+ elif key.char == "d":
+ self.pos[1] += self._pos_step * self.pos_sensitivity # inc y
+ elif key.char == "f":
+ self.pos[2] -= self._pos_step * self.pos_sensitivity # dec z
+ elif key.char == "r":
+ self.pos[2] += self._pos_step * self.pos_sensitivity # inc z
+
+ # controls for moving orientation
+ elif key.char == "z":
+ drot = rotation_matrix(angle=0.1 * self.rot_sensitivity, direction=[1.0, 0.0, 0.0])[:3, :3]
+ self.rotation = self.rotation.dot(drot) # rotates x
+ self.raw_drotation[1] -= 0.1 * self.rot_sensitivity
+ elif key.char == "x":
+ drot = rotation_matrix(angle=-0.1 * self.rot_sensitivity, direction=[1.0, 0.0, 0.0])[:3, :3]
+ self.rotation = self.rotation.dot(drot) # rotates x
+ self.raw_drotation[1] += 0.1 * self.rot_sensitivity
+ elif key.char == "t":
+ drot = rotation_matrix(angle=0.1 * self.rot_sensitivity, direction=[0.0, 1.0, 0.0])[:3, :3]
+ self.rotation = self.rotation.dot(drot) # rotates y
+ self.raw_drotation[0] += 0.1 * self.rot_sensitivity
+ elif key.char == "g":
+ drot = rotation_matrix(angle=-0.1 * self.rot_sensitivity, direction=[0.0, 1.0, 0.0])[:3, :3]
+ self.rotation = self.rotation.dot(drot) # rotates y
+ self.raw_drotation[0] -= 0.1 * self.rot_sensitivity
+ elif key.char == "c":
+ drot = rotation_matrix(angle=0.1 * self.rot_sensitivity, direction=[0.0, 0.0, 1.0])[:3, :3]
+ self.rotation = self.rotation.dot(drot) # rotates z
+ self.raw_drotation[2] += 0.1 * self.rot_sensitivity
+ elif key.char == "v":
+ drot = rotation_matrix(angle=-0.1 * self.rot_sensitivity, direction=[0.0, 0.0, 1.0])[:3, :3]
+ self.rotation = self.rotation.dot(drot) # rotates z
+ self.raw_drotation[2] -= 0.1 * self.rot_sensitivity
+
+        except AttributeError:
+ pass
+
+ def on_release(self, key):
+ """
+ Key handler for key releases.
+ Args:
+            key (str): key that was released
+ """
+
+ try:
+ # controls for grasping
+ if key == Key.space:
+ self.grasp = not self.grasp # toggle gripper
+
+ # user-commanded reset
+ elif key.char == "q":
+ self._reset_state = 1
+ self._enabled = False
+ self._reset_internal_state()
+
+        except AttributeError:
+ pass
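The driver can also be polled directly, without an environment; a minimal sketch (the pynput listener runs in a background thread, so position/rotation deltas accumulate between calls):

    import time

    from robosuite.devices import Keyboard

    device = Keyboard(pos_sensitivity=1.0, rot_sensitivity=1.0)
    device.start_control()

    for _ in range(50):
        state = device.get_controller_state()
        print(state["dpos"], state["grasp"], state["reset"])
        time.sleep(0.1)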
diff --git a/phantom/submodules/phantom-robosuite/robosuite/devices/spacemouse.py b/phantom/submodules/phantom-robosuite/robosuite/devices/spacemouse.py
new file mode 100644
index 0000000000000000000000000000000000000000..604989ff28f8dbc9662097b9f8f6e173b1c6c85d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/devices/spacemouse.py
@@ -0,0 +1,317 @@
+"""Driver class for SpaceMouse controller.
+
+This class provides driver support for a SpaceMouse on macOS.
+In particular, we assume you are using a SpaceMouse Wireless by default.
+
+To set up a new SpaceMouse controller:
+ 1. Download and install driver from https://www.3dconnexion.com/service/drivers.html
+    2. Install the hidapi library through pip
+       (make sure you uninstall the hid package first if it is installed).
+ 3. Make sure SpaceMouse is connected before running the script
+ 4. (Optional) Based on the model of SpaceMouse, you might need to change the
+ vendor id and product id that correspond to the device.
+
+For Linux support, you can find open-source Linux drivers and SDKs online.
+ See http://spacenav.sourceforge.net/
+
+"""
+
+import threading
+import time
+from collections import namedtuple
+
+import numpy as np
+
+try:
+ import hid
+except ModuleNotFoundError as exc:
+ raise ImportError(
+ "Unable to load module hid, required to interface with SpaceMouse. "
+ "Only macOS is officially supported. Install the additional "
+ "requirements with `pip install -r requirements-extra.txt`"
+ ) from exc
+
+import robosuite.macros as macros
+from robosuite.devices import Device
+from robosuite.utils.transform_utils import rotation_matrix
+
+AxisSpec = namedtuple("AxisSpec", ["channel", "byte1", "byte2", "scale"])
+
+SPACE_MOUSE_SPEC = {
+ "x": AxisSpec(channel=1, byte1=1, byte2=2, scale=1),
+ "y": AxisSpec(channel=1, byte1=3, byte2=4, scale=-1),
+ "z": AxisSpec(channel=1, byte1=5, byte2=6, scale=-1),
+ "roll": AxisSpec(channel=1, byte1=7, byte2=8, scale=-1),
+ "pitch": AxisSpec(channel=1, byte1=9, byte2=10, scale=-1),
+ "yaw": AxisSpec(channel=1, byte1=11, byte2=12, scale=1),
+}
+
+
+def to_int16(y1, y2):
+ """
+ Convert two 8 bit bytes to a signed 16 bit integer.
+
+ Args:
+ y1 (int): 8-bit byte
+ y2 (int): 8-bit byte
+
+ Returns:
+ int: 16-bit integer
+ """
+ x = (y1) | (y2 << 8)
+ if x >= 32768:
+ x = -(65536 - x)
+ return x
+
+
+def scale_to_control(x, axis_scale=350.0, min_v=-1.0, max_v=1.0):
+ """
+ Normalize raw HID readings to target range.
+
+ Args:
+ x (int): Raw reading from HID
+ axis_scale (float): (Inverted) scaling factor for mapping raw input value
+ min_v (float): Minimum limit after scaling
+ max_v (float): Maximum limit after scaling
+
+ Returns:
+ float: Clipped, scaled input from HID
+ """
+ x = x / axis_scale
+ x = min(max(x, min_v), max_v)
+ return x
+
+
+def convert(b1, b2):
+ """
+ Converts SpaceMouse message to commands.
+
+ Args:
+ b1 (int): 8-bit byte
+ b2 (int): 8-bit byte
+
+ Returns:
+ float: Scaled value from Spacemouse message
+ """
+ return scale_to_control(to_int16(b1, b2))
+
+
+class SpaceMouse(Device):
+ """
+ A minimalistic driver class for SpaceMouse with HID library.
+
+ Note: Use hid.enumerate() to view all USB human interface devices (HID).
+ Make sure SpaceMouse is detected before running the script.
+ You can look up its vendor/product id from this method.
+
+ Args:
+ vendor_id (int): HID device vendor id
+ product_id (int): HID device product id
+ pos_sensitivity (float): Magnitude of input position command scaling
+        rot_sensitivity (float): Magnitude of input rotation command scaling
+ """
+
+ def __init__(
+ self,
+ vendor_id=macros.SPACEMOUSE_VENDOR_ID,
+ product_id=macros.SPACEMOUSE_PRODUCT_ID,
+ pos_sensitivity=1.0,
+ rot_sensitivity=1.0,
+ ):
+
+ print("Opening SpaceMouse device")
+ self.vendor_id = vendor_id
+ self.product_id = product_id
+ self.device = hid.device()
+ self.device.open(self.vendor_id, self.product_id) # SpaceMouse
+
+ self.pos_sensitivity = pos_sensitivity
+ self.rot_sensitivity = rot_sensitivity
+
+ print("Manufacturer: %s" % self.device.get_manufacturer_string())
+ print("Product: %s" % self.device.get_product_string())
+
+ # 6-DOF variables
+ self.x, self.y, self.z = 0, 0, 0
+ self.roll, self.pitch, self.yaw = 0, 0, 0
+
+ self._display_controls()
+
+ self.single_click_and_hold = False
+
+ self._control = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+ self._reset_state = 0
+ self.rotation = np.array([[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0]])
+ self._enabled = False
+
+ # launch a new listener thread to listen to SpaceMouse
+ self.thread = threading.Thread(target=self.run)
+ self.thread.daemon = True
+ self.thread.start()
+
+ @staticmethod
+ def _display_controls():
+ """
+ Method to pretty print controls.
+ """
+
+ def print_command(char, info):
+ char += " " * (30 - len(char))
+ print("{}\t{}".format(char, info))
+
+ print("")
+ print_command("Control", "Command")
+ print_command("Right button", "reset simulation")
+ print_command("Left button (hold)", "close gripper")
+ print_command("Move mouse laterally", "move arm horizontally in x-y plane")
+ print_command("Move mouse vertically", "move arm vertically")
+ print_command("Twist mouse about an axis", "rotate arm about a corresponding axis")
+ print("")
+
+ def _reset_internal_state(self):
+ """
+ Resets internal state of controller, except for the reset signal.
+ """
+ self.rotation = np.array([[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0]])
+ # Reset 6-DOF variables
+ self.x, self.y, self.z = 0, 0, 0
+ self.roll, self.pitch, self.yaw = 0, 0, 0
+ # Reset control
+ self._control = np.zeros(6)
+ # Reset grasp
+ self.single_click_and_hold = False
+
+ def start_control(self):
+ """
+ Method that should be called externally before controller can
+ start receiving commands.
+ """
+ self._reset_internal_state()
+ self._reset_state = 0
+ self._enabled = True
+
+ def get_controller_state(self):
+ """
+ Grabs the current state of the 3D mouse.
+
+ Returns:
+ dict: A dictionary containing dpos, orn, unmodified orn, grasp, and reset
+ """
+ dpos = self.control[:3] * 0.005 * self.pos_sensitivity
+ roll, pitch, yaw = self.control[3:] * 0.005 * self.rot_sensitivity
+
+ # convert RPY to an absolute orientation
+ drot1 = rotation_matrix(angle=-pitch, direction=[1.0, 0, 0], point=None)[:3, :3]
+ drot2 = rotation_matrix(angle=roll, direction=[0, 1.0, 0], point=None)[:3, :3]
+ drot3 = rotation_matrix(angle=yaw, direction=[0, 0, 1.0], point=None)[:3, :3]
+
+ self.rotation = self.rotation.dot(drot1.dot(drot2.dot(drot3)))
+
+ return dict(
+ dpos=dpos,
+ rotation=self.rotation,
+ raw_drotation=np.array([roll, pitch, yaw]),
+ grasp=self.control_gripper,
+ reset=self._reset_state,
+ )
+
+ def run(self):
+ """Listener method that keeps pulling new messages."""
+
+ t_last_click = -1
+
+ while True:
+ d = self.device.read(13)
+ if d is not None and self._enabled:
+
+ if self.product_id == 50741:
+ ## logic for older spacemouse model
+
+ if d[0] == 1: ## readings from 6-DoF sensor
+ self.y = convert(d[1], d[2])
+ self.x = convert(d[3], d[4])
+ self.z = convert(d[5], d[6]) * -1.0
+
+ elif d[0] == 2:
+
+ self.roll = convert(d[1], d[2])
+ self.pitch = convert(d[3], d[4])
+ self.yaw = convert(d[5], d[6])
+
+ self._control = [
+ self.x,
+ self.y,
+ self.z,
+ self.roll,
+ self.pitch,
+ self.yaw,
+ ]
+ else:
+ ## default logic for all other spacemouse models
+
+ if d[0] == 1: ## readings from 6-DoF sensor
+ self.y = convert(d[1], d[2])
+ self.x = convert(d[3], d[4])
+ self.z = convert(d[5], d[6]) * -1.0
+
+ self.roll = convert(d[7], d[8])
+ self.pitch = convert(d[9], d[10])
+ self.yaw = convert(d[11], d[12])
+
+ self._control = [
+ self.x,
+ self.y,
+ self.z,
+ self.roll,
+ self.pitch,
+ self.yaw,
+ ]
+
+ if d[0] == 3: ## readings from the side buttons
+
+ # press left button
+ if d[1] == 1:
+ t_click = time.time()
+ elapsed_time = t_click - t_last_click
+ t_last_click = t_click
+ self.single_click_and_hold = True
+
+ # release left button
+ if d[1] == 0:
+ self.single_click_and_hold = False
+
+ # right button is for reset
+ if d[1] == 2:
+ self._reset_state = 1
+ self._enabled = False
+ self._reset_internal_state()
+
+ @property
+ def control(self):
+ """
+ Grabs current pose of Spacemouse
+
+ Returns:
+ np.array: 6-DoF control value
+ """
+ return np.array(self._control)
+
+ @property
+ def control_gripper(self):
+ """
+ Maps internal states into gripper commands.
+
+ Returns:
+ float: Whether we're using single click and hold or not
+ """
+ if self.single_click_and_hold:
+ return 1.0
+ return 0
+
+
+if __name__ == "__main__":
+
+ space_mouse = SpaceMouse()
+ for i in range(100):
+ print(space_mouse.control, space_mouse.control_gripper)
+ time.sleep(0.02)
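The byte handling above is easy to verify by hand; a small worked check of to_int16 and scale_to_control (it uses the helpers defined in this file, so importing this module requires the hid package to be installed):

    # low byte 0x10, high byte 0xFF -> 0xFF10 = 65296 -> 65296 - 65536 = -240
    assert to_int16(0x10, 0xFF) == -240

    # -240 / 350.0 is about -0.686, already inside [-1, 1], so no clipping occurs
    print(scale_to_control(-240))   # -0.6857...

    # convert() just chains the two steps on a raw (byte1, byte2) pair
    print(convert(0x10, 0xFF))      # same value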
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/environments/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff6081f09064994187ac1154f62c57066bd455c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/__init__.py
@@ -0,0 +1,3 @@
+from .base import REGISTERED_ENVS, MujocoEnv
+
+ALL_ENVIRONMENTS = REGISTERED_ENVS.keys()
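ALL_ENVIRONMENTS is the name set that robosuite.make (defined in base.py below) validates against; a quick sketch of listing it (assumes the full robosuite package imports cleanly, which is what populates the registry):

    import robosuite  # importing the package registers the environment classes

    from robosuite.environments import ALL_ENVIRONMENTS

    print(sorted(ALL_ENVIRONMENTS))   # e.g. "Door", "Lift", "NutAssembly", ..., "Wipe"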
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/base.py b/phantom/submodules/phantom-robosuite/robosuite/environments/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..62752b2fa9a13b8d3c9ab74e54b9aa2ee5a550a5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/base.py
@@ -0,0 +1,737 @@
+import os
+import xml.etree.ElementTree as ET
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite
+import robosuite.macros as macros
+import robosuite.utils.sim_utils as SU
+from robosuite.renderers.base import load_renderer_config
+from robosuite.utils import OpenCVRenderer, SimulationError, XMLError
+from robosuite.utils.binding_utils import MjRenderContextOffscreen, MjSim
+
+REGISTERED_ENVS = {}
+
+
+def register_env(target_class):
+ REGISTERED_ENVS[target_class.__name__] = target_class
+
+
+def make(env_name, *args, **kwargs):
+ """
+ Instantiates a robosuite environment.
+ This method attempts to mirror the equivalent functionality of gym.make in a somewhat sloppy way.
+ Args:
+ env_name (str): Name of the robosuite environment to initialize
+ *args: Additional arguments to pass to the specific environment class initializer
+ **kwargs: Additional arguments to pass to the specific environment class initializer
+ Returns:
+ MujocoEnv: Desired robosuite environment
+ Raises:
+ Exception: [Invalid environment name]
+ """
+ if env_name not in REGISTERED_ENVS:
+ raise Exception(
+ "Environment {} not found. Make sure it is a registered environment among: {}".format(
+ env_name, ", ".join(REGISTERED_ENVS)
+ )
+ )
+ return REGISTERED_ENVS[env_name](*args, **kwargs)
+
+
+class EnvMeta(type):
+ """Metaclass for registering environments"""
+
+ def __new__(meta, name, bases, class_dict):
+ cls = super().__new__(meta, name, bases, class_dict)
+
+ # List all environments that should not be registered here.
+ _unregistered_envs = ["MujocoEnv", "RobotEnv", "ManipulationEnv", "SingleArmEnv", "TwoArmEnv"]
+
+ if cls.__name__ not in _unregistered_envs:
+ register_env(cls)
+ return cls
+
+
+class MujocoEnv(metaclass=EnvMeta):
+ """
+ Initializes a Mujoco Environment.
+ Args:
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+ has_offscreen_renderer (bool): True if using off-screen rendering.
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+ render_collision_mesh (bool): True if rendering collision meshes
+ in camera. False otherwise.
+ render_visual_mesh (bool): True if rendering visual meshes
+ in camera. False otherwise.
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+ control_freq (float): how many control signals to receive
+ in every simulated second. This sets the amount of simulation time
+ that passes between every action input.
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+ renderer (str): string for the renderer to use
+ renderer_config (dict): dictionary for the renderer configurations
+ Raises:
+ ValueError: [Invalid renderer selection]
+ """
+
+ def __init__(
+ self,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # If you're using an onscreen renderer, you must be also using an offscreen renderer!
+ if has_renderer and not has_offscreen_renderer:
+ has_offscreen_renderer = True
+
+ # Rendering-specific attributes
+ self.has_renderer = has_renderer
+ # offscreen renderer needed for on-screen rendering
+ self.has_offscreen_renderer = has_renderer or has_offscreen_renderer
+ self.render_camera = render_camera
+ self.render_collision_mesh = render_collision_mesh
+ self.render_visual_mesh = render_visual_mesh
+ self.render_gpu_device_id = render_gpu_device_id
+ self.viewer = None
+
+ # Simulation-specific attributes
+ self._observables = {} # Maps observable names to observable objects
+ self._obs_cache = {} # Maps observable names to pre-/partially-computed observable values
+ self.control_freq = control_freq
+ self.horizon = horizon
+ self.ignore_done = ignore_done
+ self.hard_reset = hard_reset
+ self._xml_processor = None # Function to process model xml in _initialize_sim() call
+ self.model = None
+ self.cur_time = None
+ self.model_timestep = None
+ self.control_timestep = None
+ self.deterministic_reset = False # Whether to add randomized resetting of objects / robot joints
+
+ self.renderer = renderer
+ self.renderer_config = renderer_config
+
+ # Load the model
+ self._load_model()
+
+ # Initialize the simulation
+ self._initialize_sim()
+
+ # initializes the rendering
+ self.initialize_renderer()
+
+ # Run all further internal (re-)initialization required
+ self._reset_internal()
+
+ # Load observables
+ if hasattr(self.viewer, "_setup_observables"):
+ self._observables = self.viewer._setup_observables()
+ else:
+ self._observables = self._setup_observables()
+
+ # check if viewer has get observations method and set a flag for future use.
+ self.viewer_get_obs = hasattr(self.viewer, "_get_observations")
+
+ def initialize_renderer(self):
+ self.renderer = self.renderer.lower()
+
+ if self.renderer_config is None and self.renderer != "mujoco":
+ self.renderer_config = load_renderer_config(self.renderer)
+
+ if self.renderer == "mujoco" or self.renderer == "default":
+ pass
+ elif self.renderer == "nvisii":
+ from robosuite.renderers.nvisii.nvisii_renderer import NVISIIRenderer
+
+ self.viewer = NVISIIRenderer(env=self, **self.renderer_config)
+ else:
+ raise ValueError(
+ f"{self.renderer} is not a valid renderer name. Valid options include default (native mujoco renderer), and nvisii"
+ )
+
+ def initialize_time(self, control_freq):
+ """
+ Initializes the time constants used for simulation.
+ Args:
+ control_freq (float): Hz rate to run control loop at within the simulation
+ """
+ self.cur_time = 0
+ self.model_timestep = macros.SIMULATION_TIMESTEP
+ if self.model_timestep <= 0:
+ raise ValueError("Invalid simulation timestep defined!")
+ self.control_freq = control_freq
+ if control_freq <= 0:
+ raise SimulationError("Control frequency {} is invalid".format(control_freq))
+ self.control_timestep = 1.0 / control_freq
+
+ def set_xml_processor(self, processor):
+ """
+ Sets the processor function that xml string will be passed to inside _initialize_sim() calls.
+ Args:
+            processor (None or function): If set, processing method should take in an xml string and
+                return the processed xml string.
+ """
+ self._xml_processor = processor
+
+ def _load_model(self):
+ """Loads an xml model, puts it in self.model"""
+ pass
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ # Setup mappings from model to IDs
+ self.model.generate_id_mappings(sim=self.sim)
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment.
+ Returns:
+ OrderedDict: Dictionary mapping observable names to its corresponding Observable object
+ """
+ return OrderedDict()
+
+ def _initialize_sim(self, xml_string=None):
+ """
+ Creates a MjSim object and stores it in self.sim. If @xml_string is specified, the MjSim object will be created
+ from the specified xml_string. Else, it will pull from self.model to instantiate the simulation
+ Args:
+            xml_string (str): If specified, creates MjSim object from this xml string
+ """
+ xml = xml_string if xml_string else self.model.get_xml()
+
+ # process the xml before initializing sim
+ if self._xml_processor is not None:
+ xml = self._xml_processor(xml)
+
+ # Create the simulation instance
+ self.sim = MjSim.from_xml_string(xml)
+
+ # run a single step to make sure changes have propagated through sim state
+ self.sim.forward()
+
+ # Setup sim time based on control frequency
+ self.initialize_time(self.control_freq)
+
+ def reset(self):
+ """
+ Resets simulation.
+ Returns:
+ OrderedDict: Environment observation space after reset occurs
+ """
+ # TODO(yukez): investigate black screen of death
+ # Use hard reset if requested
+
+ if self.hard_reset and not self.deterministic_reset:
+ if self.renderer == "mujoco" or self.renderer == "default":
+ self._destroy_viewer()
+ self._destroy_sim()
+ self._load_model()
+ self._initialize_sim()
+ # Else, we only reset the sim internally
+ else:
+ self.sim.reset()
+
+ # Reset necessary robosuite-centric variables
+ self._reset_internal()
+ self.sim.forward()
+        # Setup observables, re-binding sensor references if this was a hard reset
+ self._obs_cache = {}
+ if self.hard_reset:
+ # If we're using hard reset, must re-update sensor object references
+ if hasattr(self.viewer, "_setup_observables"):
+ _observables = self.viewer._setup_observables()
+ else:
+ _observables = self._setup_observables()
+ for obs_name, obs in _observables.items():
+ self.modify_observable(observable_name=obs_name, attribute="sensor", modifier=obs._sensor)
+ # Make sure that all sites are toggled OFF by default
+ self.visualize(vis_settings={vis: False for vis in self._visualizations})
+
+ if self.viewer is not None and self.renderer != "mujoco":
+ self.viewer.reset()
+
+ observations = (
+ self.viewer._get_observations(force_update=True)
+ if self.viewer_get_obs
+ else self._get_observations(force_update=True)
+ )
+
+ # Return new observations
+ return observations
+
+ def _reset_internal(self):
+ """Resets simulation internal configurations."""
+
+ # create visualization screen or renderer
+ if self.has_renderer and self.viewer is None:
+ self.viewer = OpenCVRenderer(self.sim)
+
+ # Set the camera angle for viewing
+ if self.render_camera is not None:
+ camera_id = self.sim.model.camera_name2id(self.render_camera)
+ self.viewer.set_camera(camera_id)
+
+ if self.has_offscreen_renderer:
+ if self.sim._render_context_offscreen is None:
+ render_context = MjRenderContextOffscreen(self.sim, device_id=self.render_gpu_device_id)
+ self.sim._render_context_offscreen.vopt.geomgroup[0] = 1 if self.render_collision_mesh else 0
+ self.sim._render_context_offscreen.vopt.geomgroup[1] = 1 if self.render_visual_mesh else 0
+
+ # additional housekeeping
+ self.sim_state_initial = self.sim.get_state()
+ self._setup_references()
+ self.cur_time = 0
+ self.timestep = 0
+ self.done = False
+
+ # Empty observation cache and reset all observables
+ self._obs_cache = {}
+ for observable in self._observables.values():
+ observable.reset()
+
+ def _update_observables(self, force=False):
+ """
+ Updates all observables in this environment
+ Args:
+ force (bool): If True, will force all the observables to update their internal values to the newest
+ value. This is useful if, e.g., you want to grab observations when directly setting simulation states
+ without actually stepping the simulation.
+ """
+ for observable in self._observables.values():
+ observable.update(timestep=self.model_timestep, obs_cache=self._obs_cache, force=force)
+
+ def _get_observations(self, force_update=False):
+ """
+ Grabs observations from the environment.
+ Args:
+ force_update (bool): If True, will force all the observables to update their internal values to the newest
+ value. This is useful if, e.g., you want to grab observations when directly setting simulation states
+ without actually stepping the simulation.
+ Returns:
+ OrderedDict: OrderedDict containing observations [(name_string, np.array), ...]
+ """
+ observations = OrderedDict()
+ obs_by_modality = OrderedDict()
+
+ # Force an update if requested
+ if force_update:
+ self._update_observables(force=True)
+
+ # Loop through all observables and grab their current observation
+ for obs_name, observable in self._observables.items():
+ if observable.is_enabled() and observable.is_active():
+ obs = observable.obs
+ observations[obs_name] = obs
+ modality = observable.modality + "-state"
+ if modality not in obs_by_modality:
+ obs_by_modality[modality] = []
+ # Make sure all observations are numpy arrays so we can concatenate them
+ array_obs = [obs] if type(obs) in {int, float} or not obs.shape else obs
+ obs_by_modality[modality].append(np.array(array_obs))
+
+ # Add in modality observations
+ for modality, obs in obs_by_modality.items():
+ # To save memory, we only concatenate the image observations if explicitly requested
+ if modality == "image-state" and not macros.CONCATENATE_IMAGES:
+ continue
+ observations[modality] = np.concatenate(obs, axis=-1)
+
+ return observations
+
+ def step(self, action):
+ """
+ Takes a step in simulation with control command @action.
+ Args:
+ action (np.array): Action to execute within the environment
+ Returns:
+ 4-tuple:
+ - (OrderedDict) observations from the environment
+ - (float) reward from the environment
+ - (bool) whether the current episode is completed or not
+ - (dict) misc information
+ Raises:
+ ValueError: [Steps past episode termination]
+ """
+ if self.done:
+ raise ValueError("executing action in terminated episode")
+
+ self.timestep += 1
+
+ # Since the env.step frequency is slower than the mjsim timestep frequency, the internal controller will output
+ # multiple torque commands in between new high level action commands. Therefore, we need to denote via
+ # 'policy_step' whether the current step we're taking is simply an internal update of the controller,
+ # or an actual policy update
+ policy_step = True
+
+ # Loop through the simulation at the model timestep rate until we're ready to take the next policy step
+ # (as defined by the control frequency specified at the environment level)
+ for i in range(int(self.control_timestep / self.model_timestep)):
+ self.sim.forward()
+ self._pre_action(action, policy_step)
+ self.sim.step()
+ self._update_observables()
+ policy_step = False
+
+ # Note: this is done all at once to avoid floating point inaccuracies
+ self.cur_time += self.control_timestep
+
+ reward, done, info = self._post_action(action)
+
+ if self.viewer is not None and self.renderer != "mujoco":
+ self.viewer.update()
+
+ observations = self.viewer._get_observations() if self.viewer_get_obs else self._get_observations()
+ return observations, reward, done, info
+
+ def _pre_action(self, action, policy_step=False):
+ """
+ Do any preprocessing before taking an action.
+ Args:
+ action (np.array): Action to execute within the environment
+ policy_step (bool): Whether this current loop is an actual policy step or internal sim update step
+ """
+ self.sim.data.ctrl[:] = action
+
+ def _post_action(self, action):
+ """
+ Do any housekeeping after taking an action.
+ Args:
+ action (np.array): Action to execute within the environment
+ Returns:
+ 3-tuple:
+ - (float) reward from the environment
+ - (bool) whether the current episode is completed or not
+ - (dict) empty dict to be filled with information by subclassed method
+ """
+ reward = self.reward(action)
+
+ # done if number of elapsed timesteps is greater than horizon
+ self.done = (self.timestep >= self.horizon) and not self.ignore_done
+
+ return reward, self.done, {}
+
+ def reward(self, action):
+ """
+ Reward should be a function of state and action
+ Args:
+ action (np.array): Action to execute within the environment
+ Returns:
+ float: Reward from environment
+ """
+ raise NotImplementedError
+
+ def render(self):
+ """
+ Renders to an on-screen window.
+ """
+ self.viewer.render()
+
+ def get_pixel_obs(self):
+ """
+ Gets the pixel observations for the environment from the specified renderer
+ """
+ self.viewer.get_pixel_obs()
+
+ def close_renderer(self):
+ """
+ Closes the renderer
+ """
+ self.viewer.close()
+
+ def observation_spec(self):
+ """
+ Returns an observation as observation specification.
+ An alternative design is to return an OrderedDict where the keys
+ are the observation names and the values are the shapes of observations.
+ We leave this alternative implementation commented out, as we find the
+ current design is easier to use in practice.
+ Returns:
+ OrderedDict: Observations from the environment
+ """
+ observation = self.viewer._get_observations() if self.viewer_get_obs else self._get_observations()
+ return observation
+
+ def clear_objects(self, object_names):
+ """
+ Clears objects with the name @object_names out of the task space. This is useful
+ for supporting task modes with single types of objects, as in
+ @self.single_object_mode without changing the model definition.
+ Args:
+ object_names (str or list of str): Name of object(s) to remove from the task workspace
+ """
+ object_names = {object_names} if type(object_names) is str else set(object_names)
+ for obj in self.model.mujoco_objects:
+ if obj.name in object_names:
+ self.sim.data.set_joint_qpos(obj.joints[0], np.array((10, 10, 10, 1, 0, 0, 0)))
+
+ def visualize(self, vis_settings):
+ """
+ Do any needed visualization here
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "env" keyword as well as any other relevant
+ options specified.
+ """
+ # Set visuals for environment objects
+ for obj in self.model.mujoco_objects:
+ obj.set_sites_visibility(sim=self.sim, visible=vis_settings["env"])
+
+ def set_camera_pos_quat(self, camera_pos, camera_quat):
+ if self.renderer in ["nvisii"]:
+ self.viewer.set_camera_pos_quat(camera_pos, camera_quat)
+ else:
+ raise AttributeError("setting camera position and quat requires renderer to be NVISII.")
+
+ def edit_model_xml(self, xml_str):
+ """
+ This function edits the model xml with custom changes, including resolving relative paths,
+ applying changes retroactively to existing demonstration files, and other custom scripts.
+ Environment subclasses should modify this function to add environment-specific xml editing features.
+ Args:
+ xml_str (str): Mujoco sim demonstration XML file as string
+ Returns:
+ str: Edited xml file as string
+ """
+
+ path = os.path.split(robosuite.__file__)[0]
+ path_split = path.split("/")
+
+ # replace mesh and texture file paths
+ tree = ET.fromstring(xml_str)
+ root = tree
+ asset = root.find("asset")
+ meshes = asset.findall("mesh")
+ textures = asset.findall("texture")
+ all_elements = meshes + textures
+
+ for elem in all_elements:
+ old_path = elem.get("file")
+ if old_path is None:
+ continue
+ old_path_split = old_path.split("/")
+ ind = max(loc for loc, val in enumerate(old_path_split) if val == "robosuite") # last occurrence index
+ new_path_split = path_split + old_path_split[ind + 1 :]
+ new_path = "/".join(new_path_split)
+ elem.set("file", new_path)
+
+ return ET.tostring(root, encoding="utf8").decode("utf8")
+
+ def reset_from_xml_string(self, xml_string):
+ """
+ Reloads the environment from an XML description of the environment.
+ Args:
+            xml_string (str): XML description of the environment that will be loaded directly into the sim
+ """
+
+ # if there is an active viewer window, destroy it
+ if self.renderer != "nvisii":
+ self.close()
+
+ # Since we are reloading from an xml_string, we are deterministically resetting
+ self.deterministic_reset = True
+
+ # initialize sim from xml
+ self._initialize_sim(xml_string=xml_string)
+
+ # Now reset as normal
+ self.reset()
+
+ # Turn off deterministic reset
+ self.deterministic_reset = False
+
+ def check_contact(self, geoms_1, geoms_2=None):
+ """
+ Finds contact between two geom groups.
+ Args:
+ geoms_1 (str or list of str or MujocoModel): an individual geom name or list of geom names or a model. If
+ a MujocoModel is specified, the geoms checked will be its contact_geoms
+ geoms_2 (str or list of str or MujocoModel or None): another individual geom name or list of geom names.
+ If a MujocoModel is specified, the geoms checked will be its contact_geoms. If None, will check
+ any collision with @geoms_1 to any other geom in the environment
+ Returns:
+ bool: True if any geom in @geoms_1 is in contact with any geom in @geoms_2.
+ """
+ return SU.check_contact(sim=self.sim, geoms_1=geoms_1, geoms_2=geoms_2)
+
+ def get_contacts(self, model):
+ """
+ Checks for any contacts with @model (as defined by @model's contact_geoms) and returns the set of
+ geom names currently in contact with that model (excluding the geoms that are part of the model itself).
+ Args:
+ model (MujocoModel): Model to check contacts for.
+ Returns:
+ set: Unique geoms that are actively in contact with this model.
+ Raises:
+ AssertionError: [Invalid input type]
+ """
+ return SU.get_contacts(sim=self.sim, model=model)
+
+ def add_observable(self, observable):
+ """
+ Adds an observable to this environment.
+ Args:
+ observable (Observable): Observable instance.
+ """
+ assert observable.name not in self._observables, (
+ "Observable name {} is already associated with an existing observable! Use modify_observable(...) "
+ "to modify a pre-existing observable.".format(observable.name)
+ )
+ self._observables[observable.name] = observable
+
+ def modify_observable(self, observable_name, attribute, modifier):
+ """
+ Modifies observable with associated name @observable_name, replacing the given @attribute with @modifier.
+ Args:
+ observable_name (str): Observable to modify
+ attribute (str): Observable attribute to modify.
+ Options are {`'sensor'`, `'corrupter'`,`'filter'`, `'delayer'`, `'sampling_rate'`,
+ `'enabled'`, `'active'`}
+ modifier (any): New function / value to replace with for observable. If a function, new signature should
+ match the function being replaced.
+ """
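+        # Usage sketch (observable name and noise scale are hypothetical):
+        #     env.modify_observable("cube_pos", "corrupter",
+        #                           lambda x: x + np.random.normal(0.0, 0.01, size=3))
+        #     env.modify_observable("cube_pos", "sampling_rate", 10)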
+ # Find the observable
+ assert observable_name in self._observables, "No valid observable with name {} found. Options are: {}".format(
+ observable_name, self.observation_names
+ )
+ obs = self._observables[observable_name]
+ # replace attribute accordingly
+ if attribute == "sensor":
+ obs.set_sensor(modifier)
+ elif attribute == "corrupter":
+ obs.set_corrupter(modifier)
+ elif attribute == "filter":
+ obs.set_filter(modifier)
+ elif attribute == "delayer":
+ obs.set_delayer(modifier)
+ elif attribute == "sampling_rate":
+ obs.set_sampling_rate(modifier)
+ elif attribute == "enabled":
+ obs.set_enabled(modifier)
+ elif attribute == "active":
+ obs.set_active(modifier)
+ else:
+ # Invalid attribute specified
+ raise ValueError(
+ "Invalid observable attribute specified. Requested: {}, valid options are {}".format(
+ attribute, {"sensor", "corrupter", "filter", "delayer", "sampling_rate", "enabled", "active"}
+ )
+ )
+
+ def _check_success(self):
+ """
+ Checks if the task has been completed. Should be implemented by subclasses
+ Returns:
+ bool: True if the task has been completed
+ """
+ raise NotImplementedError
+
+ def _destroy_viewer(self):
+ """
+ Destroys the current mujoco renderer instance if it exists
+ """
+ # if there is an active viewer window, destroy it
+ if self.viewer is not None:
+ self.viewer.close() # change this to viewer.finish()?
+ self.viewer = None
+
+ def _destroy_sim(self):
+ """
+ Destroys the current MjSim instance if it exists
+ """
+ if self.sim is not None:
+ self.sim.free()
+ self.sim = None
+
+ def close(self):
+ """Do any cleanup necessary here."""
+ self._destroy_viewer()
+ self._destroy_sim()
+
+ @property
+ def observation_modalities(self):
+ """
+ Modalities for this environment's observations
+ Returns:
+ set: All observation modalities
+ """
+ return set([observable.modality for observable in self._observables.values()])
+
+ @property
+ def observation_names(self):
+ """
+ Grabs all names for this environment's observables
+ Returns:
+ set: All observation names
+ """
+ return set(self._observables.keys())
+
+ @property
+ def enabled_observables(self):
+ """
+ Grabs all names of enabled observables for this environment. An observable is considered enabled if its values
+ are being continually computed / updated at each simulation timestep.
+ Returns:
+ set: All enabled observation names
+ """
+ return set([name for name, observable in self._observables.items() if observable.is_enabled()])
+
+ @property
+ def active_observables(self):
+ """
+ Grabs all names of active observables for this environment. An observable is considered active if its value is
+ being returned in the observation dict from _get_observations() call or from the step() call (assuming this
+ observable is enabled).
+ Returns:
+ set: All active observation names
+ """
+ return set([name for name, observable in self._observables.items() if observable.is_active()])
+
+ @property
+ def _visualizations(self):
+ """
+ Visualization keywords for this environment
+ Returns:
+ set: All components that can be individually visualized for this environment
+ """
+ return {"env"}
+
+ @property
+ def action_spec(self):
+ """
+ Action specification should be implemented in subclasses.
+ Action space is represented by a tuple of (low, high), which are two numpy
+ vectors that specify the min/max action limits per dimension.
+ """
+ raise NotImplementedError
+
+ @property
+ def action_dim(self):
+ """
+ Size of the action space
+ Returns:
+ int: Action space dimension
+ """
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/__init__.py
@@ -0,0 +1 @@
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/door.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/door.py
new file mode 100644
index 0000000000000000000000000000000000000000..8953cf7ee69626285c7b241ca05dcb0b5e8c4ea5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/door.py
@@ -0,0 +1,461 @@
+from collections import OrderedDict
+
+import numpy as np
+
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.models.arenas import TableArena
+from robosuite.models.objects import DoorObject
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import UniformRandomSampler
+
+
+class Door(SingleArmEnv):
+ """
+ This class corresponds to the door opening task for a single robot arm.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+ Note: Must be a single single-arm robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ use_latch (bool): if True, uses a spring-loaded handle and latch to "lock" the door closed initially
+ Otherwise, door is instantiated with a fixed handle
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+        use_object_obs (bool): if True, include object (door) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ placement_initializer (ObjectPositionSampler): if provided, will
+ be used to place objects on every reset, else a UniformRandomSampler
+ is used by default.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+                robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ use_latch=True,
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ placement_initializer=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # settings for table top (hardcoded since it's not an essential part of the environment)
+ self.table_full_size = (0.8, 0.3, 0.05)
+ self.table_offset = (-0.2, -0.35, 0.8)
+
+ # reward configuration
+ self.use_latch = use_latch
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # object placement initializer
+ self.placement_initializer = placement_initializer
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 1.0 is provided if the door is opened
+
+ Un-normalized summed components if using reward shaping:
+
+            - Reaching: in [0, 0.25], increasing as the distance between the door handle and the robot arm decreases
+            - Rotating: in [0, 0.25], proportional to the angle rotated by the door handle
+ - Note that this component is only relevant if the environment is using the locked door version
+
+        Note that a successfully completed task (door opened) will return 1.0 regardless of whether the environment
+ is using sparse or shaped rewards
+
+ Note that the final reward is normalized and scaled by reward_scale / 1.0 as
+ well so that the max score is equal to reward_scale
+
+ Args:
+ action (np.array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ reward = 0.0
+
+ # sparse completion reward
+ if self._check_success():
+ reward = 1.0
+
+ # else, we consider only the case if we're using shaped rewards
+ elif self.reward_shaping:
+ # Add reaching component
+ dist = np.linalg.norm(self._gripper_to_handle)
+ reaching_reward = 0.25 * (1 - np.tanh(10.0 * dist))
+ reward += reaching_reward
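+            # Worked example (illustrative): at dist = 0.05 m,
+            # reaching_reward = 0.25 * (1 - tanh(0.5)) ≈ 0.134.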
+ # Add rotating component if we're using a locked door
+ if self.use_latch:
+ handle_qpos = self.sim.data.qpos[self.handle_qpos_addr]
+ reward += np.clip(0.25 * np.abs(handle_qpos / (0.5 * np.pi)), -0.25, 0.25)
+
+ # Scale reward if requested
+ if self.reward_scale is not None:
+ reward *= self.reward_scale / 1.0
+
+ return reward
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose accordingly
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = TableArena(
+ table_full_size=self.table_full_size,
+ table_offset=self.table_offset,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # Modify default agentview camera
+ mujoco_arena.set_camera(
+ camera_name="agentview",
+ pos=[0.5986131746834771, -4.392035683362857e-09, 1.5903500240372423],
+ quat=[0.6380177736282349, 0.3048497438430786, 0.30484986305236816, 0.6380177736282349],
+ )
+
+ # initialize objects of interest
+ self.door = DoorObject(
+ name="Door",
+ friction=0.0,
+ damping=0.1,
+ lock=self.use_latch,
+ )
+
+ # Create placement initializer
+ if self.placement_initializer is not None:
+ self.placement_initializer.reset()
+ self.placement_initializer.add_objects(self.door)
+ else:
+ self.placement_initializer = UniformRandomSampler(
+ name="ObjectSampler",
+ mujoco_objects=self.door,
+ x_range=[0.07, 0.09],
+ y_range=[-0.01, 0.01],
+ rotation=(-np.pi / 2.0 - 0.25, -np.pi / 2.0),
+ rotation_axis="z",
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=True,
+ reference_pos=self.table_offset,
+ )
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=self.door,
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Additional object references from this env
+ self.object_body_ids = dict()
+ self.object_body_ids["door"] = self.sim.model.body_name2id(self.door.door_body)
+ self.object_body_ids["frame"] = self.sim.model.body_name2id(self.door.frame_body)
+ self.object_body_ids["latch"] = self.sim.model.body_name2id(self.door.latch_body)
+ self.door_handle_site_id = self.sim.model.site_name2id(self.door.important_sites["handle"])
+ self.hinge_qpos_addr = self.sim.model.get_joint_qpos_addr(self.door.joints[0])
+ if self.use_latch:
+ self.handle_qpos_addr = self.sim.model.get_joint_qpos_addr(self.door.joints[1])
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ pf = self.robots[0].robot_model.naming_prefix
+ modality = "object"
+
+ # Define sensor callbacks
+ @sensor(modality=modality)
+ def door_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.object_body_ids["door"]])
+
+ @sensor(modality=modality)
+ def handle_pos(obs_cache):
+ return self._handle_xpos
+
+ @sensor(modality=modality)
+ def door_to_eef_pos(obs_cache):
+ return (
+ obs_cache["door_pos"] - obs_cache[f"{pf}eef_pos"]
+ if "door_pos" in obs_cache and f"{pf}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def handle_to_eef_pos(obs_cache):
+ return (
+ obs_cache["handle_pos"] - obs_cache[f"{pf}eef_pos"]
+ if "handle_pos" in obs_cache and f"{pf}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def hinge_qpos(obs_cache):
+ return np.array([self.sim.data.qpos[self.hinge_qpos_addr]])
+
+ sensors = [door_pos, handle_pos, door_to_eef_pos, handle_to_eef_pos, hinge_qpos]
+ names = [s.__name__ for s in sensors]
+
+ # Also append handle qpos if we're using a locked door version with rotatable handle
+ if self.use_latch:
+
+ @sensor(modality=modality)
+ def handle_qpos(obs_cache):
+ return np.array([self.sim.data.qpos[self.handle_qpos_addr]])
+
+ sensors.append(handle_qpos)
+ names.append("handle_qpos")
+
+ # Create observables
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # We know we're only setting a single object (the door), so specifically set its pose
+ door_pos, door_quat, _ = object_placements[self.door.name]
+ door_body_id = self.sim.model.body_name2id(self.door.root_body)
+ self.sim.model.body_pos[door_body_id] = door_pos
+ self.sim.model.body_quat[door_body_id] = door_quat
+
+ def _check_success(self):
+ """
+ Check if door has been opened.
+
+ Returns:
+ bool: True if door has been opened
+ """
+ hinge_qpos = self.sim.data.qpos[self.hinge_qpos_addr]
+ return hinge_qpos > 0.3
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualize gripper site proportional to the distance to the door handle.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+ # Color the gripper visualization site according to its distance to the door handle
+ if vis_settings["grippers"]:
+ self._visualize_gripper_to_target(
+ gripper=self.robots[0].gripper, target=self.door.important_sites["handle"], target_type="site"
+ )
+
+ @property
+ def _handle_xpos(self):
+ """
+        Grabs the position of the door handle.
+
+ Returns:
+ np.array: Door handle (x,y,z)
+ """
+ return self.sim.data.site_xpos[self.door_handle_site_id]
+
+ @property
+ def _gripper_to_handle(self):
+ """
+ Calculates distance from the gripper to the door handle.
+
+ Returns:
+ np.array: (x,y,z) distance between handle and eef
+ """
+ return self._handle_xpos - self._eef_xpos
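+
+
+# Usage sketch (illustrative; assumes the standard robosuite entry point suite.make):
+#     import numpy as np
+#     import robosuite as suite
+#     env = suite.make("Door", robots="Panda", has_renderer=False,
+#                      has_offscreen_renderer=False, use_camera_obs=False)
+#     obs = env.reset()
+#     obs, reward, done, info = env.step(np.zeros(env.action_dim))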
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/lift.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/lift.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d27b300f2e0f181b6ceb3bdb3ad7d76998909e0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/lift.py
@@ -0,0 +1,428 @@
+from collections import OrderedDict
+
+import numpy as np
+
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.models.arenas import TableArena
+from robosuite.models.objects import BoxObject
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.mjcf_utils import CustomMaterial
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import UniformRandomSampler
+from robosuite.utils.transform_utils import convert_quat
+
+
+class Lift(SingleArmEnv):
+ """
+ This class corresponds to the lifting task for a single robot arm.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+ Note: Must be a single single-arm robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+ use_object_obs (bool): if True, include object (cube) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ placement_initializer (ObjectPositionSampler): if provided, will
+ be used to place objects on every reset, else a UniformRandomSampler
+ is used by default.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+                robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1.0, 5e-3, 1e-4),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ placement_initializer=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_friction = table_friction
+ self.table_offset = np.array((0, 0, 0.8))
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # object placement initializer
+ self.placement_initializer = placement_initializer
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 2.25 is provided if the cube is lifted
+
+ Un-normalized summed components if using reward shaping:
+
+ - Reaching: in [0, 1], to encourage the arm to reach the cube
+ - Grasping: in {0, 0.25}, non-zero if arm is grasping the cube
+ - Lifting: in {0, 1}, non-zero if arm has lifted the cube
+
+ The sparse reward only consists of the lifting component.
+
+ Note that the final reward is normalized and scaled by
+ reward_scale / 2.25 as well so that the max score is equal to reward_scale
+
+ Args:
+ action (np array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ reward = 0.0
+
+ # sparse completion reward
+ if self._check_success():
+ reward = 2.25
+
+ # use a shaping reward
+ elif self.reward_shaping:
+
+ # reaching reward
+ cube_pos = self.sim.data.body_xpos[self.cube_body_id]
+ gripper_site_pos = self.sim.data.site_xpos[self.robots[0].eef_site_id]
+ dist = np.linalg.norm(gripper_site_pos - cube_pos)
+ reaching_reward = 1 - np.tanh(10.0 * dist)
+ reward += reaching_reward
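+            # Worked example (illustrative): at dist = 0.1 m,
+            # reaching_reward = 1 - tanh(1.0) ≈ 0.238; a successful grasp then adds 0.25.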
+
+ # grasping reward
+ if self._check_grasp(gripper=self.robots[0].gripper, object_geoms=self.cube):
+ reward += 0.25
+
+ # Scale reward if requested
+ if self.reward_scale is not None:
+ reward *= self.reward_scale / 2.25
+
+ return reward
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose accordingly
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = TableArena(
+ table_full_size=self.table_full_size,
+ table_friction=self.table_friction,
+ table_offset=self.table_offset,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # initialize objects of interest
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "1 1",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ redwood = CustomMaterial(
+ texture="WoodRed",
+ tex_name="redwood",
+ mat_name="redwood_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.cube = BoxObject(
+ name="cube",
+ size_min=[0.020, 0.020, 0.020], # [0.015, 0.015, 0.015],
+ size_max=[0.022, 0.022, 0.022], # [0.018, 0.018, 0.018])
+ rgba=[1, 0, 0, 1],
+ material=redwood,
+ )
+
+ # Create placement initializer
+ if self.placement_initializer is not None:
+ self.placement_initializer.reset()
+ self.placement_initializer.add_objects(self.cube)
+ else:
+ self.placement_initializer = UniformRandomSampler(
+ name="ObjectSampler",
+ mujoco_objects=self.cube,
+ x_range=[-0.03, 0.03],
+ y_range=[-0.03, 0.03],
+ rotation=None,
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=True,
+ reference_pos=self.table_offset,
+ z_offset=0.01,
+ )
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=self.cube,
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Additional object references from this env
+ self.cube_body_id = self.sim.model.body_name2id(self.cube.root_body)
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ pf = self.robots[0].robot_model.naming_prefix
+ modality = "object"
+
+ # cube-related observables
+ @sensor(modality=modality)
+ def cube_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.cube_body_id])
+
+ @sensor(modality=modality)
+ def cube_quat(obs_cache):
+ return convert_quat(np.array(self.sim.data.body_xquat[self.cube_body_id]), to="xyzw")
+
+ @sensor(modality=modality)
+ def gripper_to_cube_pos(obs_cache):
+ return (
+ obs_cache[f"{pf}eef_pos"] - obs_cache["cube_pos"]
+ if f"{pf}eef_pos" in obs_cache and "cube_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ sensors = [cube_pos, cube_quat, gripper_to_cube_pos]
+ names = [s.__name__ for s in sensors]
+
+ # Create observables
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # Loop through all objects and reset their positions
+ for obj_pos, obj_quat, obj in object_placements.values():
+ self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualize gripper site proportional to the distance to the cube.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+ # Color the gripper visualization site according to its distance to the cube
+ if vis_settings["grippers"]:
+ self._visualize_gripper_to_target(gripper=self.robots[0].gripper, target=self.cube)
+
+ def _check_success(self):
+ """
+ Check if cube has been lifted.
+
+ Returns:
+ bool: True if cube has been lifted
+ """
+ cube_height = self.sim.data.body_xpos[self.cube_body_id][2]
+ table_height = self.model.mujoco_arena.table_offset[2]
+
+ # cube is higher than the table top above a margin
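+        # e.g. with the default table_offset of (0, 0, 0.8), success requires cube z > 0.84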
+ return cube_height > table_height + 0.04
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/manipulation_env.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/manipulation_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c37a03c0e097a441f31b9a141266f7aa601cbd6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/manipulation_env.py
@@ -0,0 +1,322 @@
+import numpy as np
+
+from robosuite.environments.robot_env import RobotEnv
+from robosuite.models.base import MujocoModel
+from robosuite.models.grippers import GripperModel
+from robosuite.robots import ROBOT_CLASS_MAPPING, Manipulator
+
+
+class ManipulationEnv(RobotEnv):
+ """
+ Initializes a manipulation-specific robot environment in Mujoco.
+
+ Args:
+ robots: Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+
+ env_configuration (str): Specifies how to position the robot(s) within the environment. Default is "default",
+ which should be interpreted accordingly by any subclasses.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ mount_types (None or str or list of str): type of mount, used to instantiate mount models from mount factory.
+            Default is "default", which is the default mount associated with the robot(s) in the 'robots' specification.
+ None results in no mount, and any other (valid) model overrides the default mount. Should either be
+ single str if same mount type is to be used for all robots or else it should be a list of the same
+ length as "robots" param
+
+ gripper_types (None or str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+                robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ ValueError: [Camera obs require offscreen renderer]
+ ValueError: [Camera name must be specified to use camera obs]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ mount_types="default",
+ gripper_types="default",
+ initialization_noise=None,
+ use_camera_obs=True,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None,
+ renderer="mujoco",
+ renderer_config=None,
+ direct_gripper_control=False,
+ ):
+ # Robot info
+ robots = list(robots) if type(robots) is list or type(robots) is tuple else [robots]
+ num_robots = len(robots)
+
+ # Gripper
+ gripper_types = self._input2list(gripper_types, num_robots)
+
+ # Robot configurations to pass to super call
+ robot_configs = [
+ {
+ "gripper_type": gripper_types[idx],
+ }
+ for idx in range(num_robots)
+ ]
+
+ # Run superclass init
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types=mount_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ robot_configs=robot_configs,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ direct_gripper_control=direct_gripper_control,
+ )
+
+ @property
+ def _visualizations(self):
+ """
+ Visualization keywords for this environment
+
+ Returns:
+ set: All components that can be individually visualized for this environment
+ """
+ vis_set = super()._visualizations
+ vis_set.add("grippers")
+ return vis_set
+
+ def _check_grasp(self, gripper, object_geoms):
+ """
+ Checks whether the specified gripper as defined by @gripper is grasping the specified object in the environment.
+
+ By default, this will return True if at least one geom in both the "left_fingerpad" and "right_fingerpad" geom
+ groups are in contact with any geom specified by @object_geoms. Custom gripper geom groups can be
+ specified with @gripper as well.
+
+ Args:
+            gripper (GripperModel or str or list of str or list of list of str): If a MujocoModel, this is the specific
+ gripper to check for grasping (as defined by "left_fingerpad" and "right_fingerpad" geom groups). Otherwise,
+ this sets custom gripper geom groups which together define a grasp. This can be a string
+ (one group of single gripper geom), a list of string (multiple groups of single gripper geoms) or a
+ list of list of string (multiple groups of multiple gripper geoms). At least one geom from each group
+ must be in contact with any geom in @object_geoms for this method to return True.
+ object_geoms (str or list of str or MujocoModel): If a MujocoModel is inputted, will check for any
+ collisions with the model's contact_geoms. Otherwise, this should be specific geom name(s) composing
+ the object to check for contact.
+
+ Returns:
+ bool: True if the gripper is grasping the given object
+ """
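+        # Usage sketch (self.cube and the pad geom names are hypothetical examples):
+        #     self._check_grasp(gripper=self.robots[0].gripper, object_geoms=self.cube)
+        #     self._check_grasp(gripper=["gripper0_finger1_pad", "gripper0_finger2_pad"],
+        #                       object_geoms="cube_g0")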
+ # Convert object, gripper geoms into standardized form
+ if isinstance(object_geoms, MujocoModel):
+ o_geoms = object_geoms.contact_geoms
+ else:
+ o_geoms = [object_geoms] if type(object_geoms) is str else object_geoms
+ if isinstance(gripper, GripperModel):
+ g_geoms = [gripper.important_geoms["left_fingerpad"], gripper.important_geoms["right_fingerpad"]]
+ elif type(gripper) is str:
+ g_geoms = [[gripper]]
+ else:
+ # Parse each element in the gripper_geoms list accordingly
+ g_geoms = [[g_group] if type(g_group) is str else g_group for g_group in gripper]
+
+ # Search for collisions between each gripper geom group and the object geoms group
+ for g_group in g_geoms:
+ if not self.check_contact(g_group, o_geoms):
+ return False
+ return True
+
+ def _gripper_to_target(self, gripper, target, target_type="body", return_distance=False):
+ """
+ Calculates the (x,y,z) Cartesian distance (target_pos - gripper_pos) from the specified @gripper to the
+ specified @target. If @return_distance is set, will return the Euclidean (scalar) distance instead.
+
+ Args:
+            gripper (MujocoModel): Gripper model whose grip_site position is used as the reference point
+ target (MujocoModel or str): Either a site / geom / body name, or a model that serves as the target.
+ If a model is given, then the root body will be used as the target.
+ target_type (str): One of {"body", "geom", or "site"}, corresponding to the type of element @target
+ refers to.
+ return_distance (bool): If set, will return Euclidean distance instead of Cartesian distance
+
+ Returns:
+ np.array or float: (Cartesian or Euclidean) distance from gripper to target
+ """
+ # Get gripper and target positions
+ gripper_pos = self.sim.data.get_site_xpos(gripper.important_sites["grip_site"])
+ # If target is MujocoModel, grab the correct body as the target and find the target position
+ if isinstance(target, MujocoModel):
+ target_pos = self.sim.data.get_body_xpos(target.root_body)
+ elif target_type == "body":
+ target_pos = self.sim.data.get_body_xpos(target)
+ elif target_type == "site":
+ target_pos = self.sim.data.get_site_xpos(target)
+ else:
+ target_pos = self.sim.data.get_geom_xpos(target)
+ # Calculate distance
+ diff = target_pos - gripper_pos
+ # Return appropriate value
+ return np.linalg.norm(diff) if return_distance else diff
+
+ def _visualize_gripper_to_target(self, gripper, target, target_type="body"):
+ """
+ Colors the grip visualization site proportional to the Euclidean distance to the specified @target.
+ Colors go from red --> green as the gripper gets closer.
+
+ Args:
+ gripper (MujocoModel): Gripper model to update grip site rgb
+ target (MujocoModel or str): Either a site / geom / body name, or a model that serves as the target.
+ If a model is given, then the root body will be used as the target.
+ target_type (str): One of {"body", "geom", or "site"}, corresponding to the type of element @target
+ refers to.
+ """
+ # Get gripper and target positions
+ gripper_pos = self.sim.data.get_site_xpos(gripper.important_sites["grip_site"])
+ # If target is MujocoModel, grab the correct body as the target and find the target position
+ if isinstance(target, MujocoModel):
+ target_pos = self.sim.data.get_body_xpos(target.root_body)
+ elif target_type == "body":
+ target_pos = self.sim.data.get_body_xpos(target)
+ elif target_type == "site":
+ target_pos = self.sim.data.get_site_xpos(target)
+ else:
+ target_pos = self.sim.data.get_geom_xpos(target)
+ # color the gripper site appropriately based on (squared) distance to target
+ dist = np.sum(np.square((target_pos - gripper_pos)))
+ max_dist = 0.1
+ scaled = (1.0 - min(dist / max_dist, 1.0)) ** 15
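+        # Worked example (illustrative): at a squared distance of 0.025 (~16 cm away),
+        # scaled = (1 - 0.25) ** 15 ≈ 0.013, so the site stays nearly all red until
+        # the gripper is within a few centimeters of the target.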
+ rgba = np.zeros(3)
+ rgba[0] = 1 - scaled
+ rgba[1] = scaled
+ self.sim.model.site_rgba[self.sim.model.site_name2id(gripper.important_sites["grip_site"])][:3] = rgba
+
+ def _check_robot_configuration(self, robots):
+ """
+ Sanity check to make sure inputted robots and the corresponding requested task/configuration combo is legal.
+ Should be implemented in every specific task module
+
+ Args:
+ robots (str or list of str): Inputted requested robots at the task-level environment
+ """
+ # Make sure all inputted robots are a manipulation robot
+ if type(robots) is str:
+ robots = [robots]
+ for robot in robots:
+ assert issubclass(
+ ROBOT_CLASS_MAPPING[robot], Manipulator
+ ), "Only manipulator robots supported for manipulation environment!"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/nut_assembly.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/nut_assembly.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c0583c8ae2854f328191f78e4d16130521c8f94
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/nut_assembly.py
@@ -0,0 +1,708 @@
+import random
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.models.arenas import PegsArena
+from robosuite.models.objects import RoundNutObject, SquareNutObject
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import SequentialCompositeSampler, UniformRandomSampler
+
+
+class NutAssembly(SingleArmEnv):
+ """
+ This class corresponds to the nut assembly task for a single robot arm.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+ Note: Must be a single single-arm robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+        use_object_obs (bool): if True, include object (nut) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ placement_initializer (ObjectPositionSampler): if provided, will
+ be used to place objects on every reset, else a UniformRandomSampler
+ is used by default.
+
+ single_object_mode (int): specifies which version of the task to do. Note that
+ the observations change accordingly.
+
+ :`0`: corresponds to the full task with both types of nuts.
+
+ :`1`: corresponds to an easier task with only one type of nut initialized
+ on the table with every reset. The type is randomized on every reset.
+
+ :`2`: corresponds to an easier task with only one type of nut initialized
+ on the table with every reset. The type is kept constant and will not
+ change between resets.
+
+ nut_type (string): if provided, should be either "round" or "square". Determines
+ which type of nut (round or square) will be spawned on every environment
+ reset. Only used if @single_object_mode is 2.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+            robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Invalid nut type specified]
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1, 0.005, 0.0001),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ placement_initializer=None,
+ single_object_mode=0,
+ nut_type=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # task settings
+ self.single_object_mode = single_object_mode
+ self.nut_to_id = {"square": 0, "round": 1}
+ self.nut_id_to_sensors = {} # Maps nut id to sensor names for that nut
+ if nut_type is not None:
+ assert nut_type in self.nut_to_id.keys(), "invalid @nut_type argument - choose one of {}".format(
+ list(self.nut_to_id.keys())
+ )
+ self.nut_id = self.nut_to_id[nut_type] # use for convenient indexing
+ self.obj_to_use = None
+
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_friction = table_friction
+ self.table_offset = np.array((0, 0, 0.82))
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # object placement initializer
+ self.placement_initializer = placement_initializer
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 1.0 per nut if it is placed around its correct peg
+
+ Un-normalized components if using reward shaping, where the maximum is returned if not solved:
+
+ - Reaching: in [0, 0.1], proportional to the distance between the gripper and the closest nut
+ - Grasping: in {0, 0.35}, nonzero if the gripper is grasping a nut
+ - Lifting: in {0, [0.35, 0.5]}, nonzero only if nut is grasped; proportional to lifting height
+ - Hovering: in {0, [0.5, 0.7]}, nonzero only if nut is lifted; proportional to distance from nut to peg
+
+        Note that a successfully completed task (nut around peg) will return 1.0 per nut regardless of whether the
+ environment is using sparse or shaped rewards
+
+ Note that the final reward is normalized and scaled by reward_scale / 2.0 (or 1.0 if only a single nut is
+ being used) as well so that the max score is equal to reward_scale
+
+ Args:
+ action (np.array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ # compute sparse rewards
+ self._check_success()
+ reward = np.sum(self.objects_on_pegs)
+
+ # add in shaped rewards
+ if self.reward_shaping:
+ staged_rewards = self.staged_rewards()
+ reward += max(staged_rewards)
+ if self.reward_scale is not None:
+ reward *= self.reward_scale
+ if self.single_object_mode == 0:
+ reward /= 2.0
+ return reward
+
+ def staged_rewards(self):
+ """
+ Calculates staged rewards based on current physical states.
+ Stages consist of reaching, grasping, lifting, and hovering.
+
+ Returns:
+ 4-tuple:
+
+ - (float) reaching reward
+ - (float) grasping reward
+ - (float) lifting reward
+ - (float) hovering reward
+ """
+
+ reach_mult = 0.1
+ grasp_mult = 0.35
+ lift_mult = 0.5
+ hover_mult = 0.7
+
+ # filter out objects that are already on the correct pegs
+ active_nuts = []
+ for i, nut in enumerate(self.nuts):
+ if self.objects_on_pegs[i]:
+ continue
+ active_nuts.append(nut)
+
+ # reaching reward governed by distance to closest object
+ r_reach = 0.0
+ if active_nuts:
+ # reaching reward via minimum distance to the handles of the objects
+ dists = [
+ self._gripper_to_target(
+ gripper=self.robots[0].gripper,
+ target=active_nut.important_sites["handle"],
+ target_type="site",
+ return_distance=True,
+ )
+ for active_nut in active_nuts
+ ]
+ r_reach = (1 - np.tanh(10.0 * min(dists))) * reach_mult
+
+ # grasping reward for touching any objects of interest
+ r_grasp = (
+ int(
+ self._check_grasp(
+ gripper=self.robots[0].gripper,
+ object_geoms=[g for active_nut in active_nuts for g in active_nut.contact_geoms],
+ )
+ )
+ * grasp_mult
+ )
+
+ # lifting reward for picking up an object
+ r_lift = 0.0
+ table_pos = np.array(self.sim.data.body_xpos[self.table_body_id])
+ if active_nuts and r_grasp > 0.0:
+ z_target = table_pos[2] + 0.2
+ object_z_locs = self.sim.data.body_xpos[[self.obj_body_id[active_nut.name] for active_nut in active_nuts]][
+ :, 2
+ ]
+ z_dists = np.maximum(z_target - object_z_locs, 0.0)
+ r_lift = grasp_mult + (1 - np.tanh(15.0 * min(z_dists))) * (lift_mult - grasp_mult)
+
+ # hover reward for getting object above peg
+ r_hover = 0.0
+ if active_nuts:
+ r_hovers = np.zeros(len(active_nuts))
+ peg_body_ids = [self.peg1_body_id, self.peg2_body_id]
+ for i, nut in enumerate(active_nuts):
+ valid_obj = False
+ peg_pos = None
+ for nut_name, idn in self.nut_to_id.items():
+ if nut_name in nut.name.lower():
+ peg_pos = np.array(self.sim.data.body_xpos[peg_body_ids[idn]])[:2]
+ valid_obj = True
+ break
+ if not valid_obj:
+ raise Exception("Got invalid object to reach: {}".format(nut.name))
+ ob_xy = self.sim.data.body_xpos[self.obj_body_id[nut.name]][:2]
+ dist = np.linalg.norm(peg_pos - ob_xy)
+ r_hovers[i] = r_lift + (1 - np.tanh(10.0 * dist)) * (hover_mult - lift_mult)
+ r_hover = np.max(r_hovers)
+
+ return r_reach, r_grasp, r_lift, r_hover
+
+ def on_peg(self, obj_pos, peg_id):
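+        """
+        Checks whether a nut at @obj_pos counts as placed around peg @peg_id: the nut's xy position must
+        lie within 3 cm of the peg and its height must be within 5 cm of the table surface.
+        """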
+
+ if peg_id == 0:
+ peg_pos = np.array(self.sim.data.body_xpos[self.peg1_body_id])
+ else:
+ peg_pos = np.array(self.sim.data.body_xpos[self.peg2_body_id])
+ res = False
+ if (
+ abs(obj_pos[0] - peg_pos[0]) < 0.03
+ and abs(obj_pos[1] - peg_pos[1]) < 0.03
+ and obj_pos[2] < self.table_offset[2] + 0.05
+ ):
+ res = True
+ return res
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose accordingly
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = PegsArena(
+ table_full_size=self.table_full_size,
+ table_friction=self.table_friction,
+ table_offset=self.table_offset,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # define nuts
+ self.nuts = []
+ nut_names = ("SquareNut", "RoundNut")
+
+ # Create default (SequentialCompositeSampler) sampler if it has not already been specified
+ if self.placement_initializer is None:
+ self.placement_initializer = SequentialCompositeSampler(name="ObjectSampler")
+ for nut_name, default_y_range in zip(nut_names, ([0.11, 0.225], [-0.225, -0.11])):
+ self.placement_initializer.append_sampler(
+ sampler=UniformRandomSampler(
+ name=f"{nut_name}Sampler",
+ x_range=[-0.115, -0.11],
+ y_range=default_y_range,
+ rotation=None,
+ rotation_axis="z",
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=True,
+ reference_pos=self.table_offset,
+ z_offset=0.02,
+ )
+ )
+ # Reset sampler before adding any new samplers / objects
+ self.placement_initializer.reset()
+
+ for i, (nut_cls, nut_name) in enumerate(
+ zip(
+ (SquareNutObject, RoundNutObject),
+ nut_names,
+ )
+ ):
+ nut = nut_cls(name=nut_name)
+ self.nuts.append(nut)
+ # Add this nut to the placement initializer
+ if isinstance(self.placement_initializer, SequentialCompositeSampler):
+ # assumes we have two samplers so we add nuts to them
+ self.placement_initializer.add_objects_to_sampler(sampler_name=f"{nut_name}Sampler", mujoco_objects=nut)
+ else:
+ # This is assumed to be a flat sampler, so we just add all nuts to this sampler
+ self.placement_initializer.add_objects(nut)
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=self.nuts,
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Additional object references from this env
+ self.obj_body_id = {}
+ self.obj_geom_id = {}
+
+ self.table_body_id = self.sim.model.body_name2id("table")
+ self.peg1_body_id = self.sim.model.body_name2id("peg1")
+ self.peg2_body_id = self.sim.model.body_name2id("peg2")
+
+ for nut in self.nuts:
+ self.obj_body_id[nut.name] = self.sim.model.body_name2id(nut.root_body)
+ self.obj_geom_id[nut.name] = [self.sim.model.geom_name2id(g) for g in nut.contact_geoms]
+
+ # information of objects
+ self.object_site_ids = [self.sim.model.site_name2id(nut.important_sites["handle"]) for nut in self.nuts]
+
+ # keep track of which objects are on their corresponding pegs
+ self.objects_on_pegs = np.zeros(len(self.nuts))
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+ OrderedDict: Dictionary mapping observable names to its corresponding Observable object
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ pf = self.robots[0].robot_model.naming_prefix
+ modality = "object"
+
+ # Reset nut sensor mappings
+ self.nut_id_to_sensors = {}
+
+ # for conversion to relative gripper frame
+ @sensor(modality=modality)
+ def world_pose_in_gripper(obs_cache):
+ return (
+ T.pose_inv(T.pose2mat((obs_cache[f"{pf}eef_pos"], obs_cache[f"{pf}eef_quat"])))
+ if f"{pf}eef_pos" in obs_cache and f"{pf}eef_quat" in obs_cache
+ else np.eye(4)
+ )
+
+ sensors = [world_pose_in_gripper]
+ names = ["world_pose_in_gripper"]
+ enableds = [True]
+ actives = [False]
+
+ # Define nut related sensors
+ for i, nut in enumerate(self.nuts):
+ # Create sensors for this nut
+ using_nut = self.single_object_mode == 0 or self.nut_id == i
+ nut_sensors, nut_sensor_names = self._create_nut_sensors(nut_name=nut.name, modality=modality)
+ sensors += nut_sensors
+ names += nut_sensor_names
+ enableds += [using_nut] * 4
+ actives += [using_nut] * 4
+ self.nut_id_to_sensors[i] = nut_sensor_names
+
+ if self.single_object_mode == 1:
+ # This is randomly sampled object, so we need to include object id as observation
+ @sensor(modality=modality)
+ def nut_id(obs_cache):
+ return self.nut_id
+
+ sensors.append(nut_id)
+ names.append("nut_id")
+ enableds.append(True)
+ actives.append(True)
+
+ # Create observables
+ for name, s, enabled, active in zip(names, sensors, enableds, actives):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ enabled=enabled,
+ active=active,
+ )
+
+ return observables
+
+ def _create_nut_sensors(self, nut_name, modality="object"):
+ """
+ Helper function to create sensors for a given nut. This is abstracted in a separate function call so that we
+ don't have local function naming collisions during the _setup_observables() call.
+
+ Args:
+ nut_name (str): Name of nut to create sensors for
+ modality (str): Modality to assign to all sensors
+
+ Returns:
+ 2-tuple:
+ sensors (list): Array of sensors for the given nut
+ names (list): array of corresponding observable names
+ """
+ pf = self.robots[0].robot_model.naming_prefix
+
+ @sensor(modality=modality)
+ def nut_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.obj_body_id[nut_name]])
+
+ @sensor(modality=modality)
+ def nut_quat(obs_cache):
+ return T.convert_quat(self.sim.data.body_xquat[self.obj_body_id[nut_name]], to="xyzw")
+
+ @sensor(modality=modality)
+ def nut_to_eef_pos(obs_cache):
+ # Immediately return default value if cache is empty
+ if any(
+ [name not in obs_cache for name in [f"{nut_name}_pos", f"{nut_name}_quat", "world_pose_in_gripper"]]
+ ):
+ return np.zeros(3)
+ obj_pose = T.pose2mat((obs_cache[f"{nut_name}_pos"], obs_cache[f"{nut_name}_quat"]))
+ rel_pose = T.pose_in_A_to_pose_in_B(obj_pose, obs_cache["world_pose_in_gripper"])
+ rel_pos, rel_quat = T.mat2pose(rel_pose)
+ obs_cache[f"{nut_name}_to_{pf}eef_quat"] = rel_quat
+ return rel_pos
+
+ @sensor(modality=modality)
+ def nut_to_eef_quat(obs_cache):
+ return (
+ obs_cache[f"{nut_name}_to_{pf}eef_quat"] if f"{nut_name}_to_{pf}eef_quat" in obs_cache else np.zeros(4)
+ )
+
+ sensors = [nut_pos, nut_quat, nut_to_eef_pos, nut_to_eef_quat]
+ names = [f"{nut_name}_pos", f"{nut_name}_quat", f"{nut_name}_to_{pf}eef_pos", f"{nut_name}_to_{pf}eef_quat"]
+
+ return sensors, names
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # Loop through all objects and reset their positions
+ for obj_pos, obj_quat, obj in object_placements.values():
+ self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
+
+ # Move objects out of the scene depending on the mode
+ nut_names = {nut.name for nut in self.nuts}
+ if self.single_object_mode == 1:
+ self.obj_to_use = random.choice(list(nut_names))
+ for nut_type, i in self.nut_to_id.items():
+ if nut_type.lower() in self.obj_to_use.lower():
+ self.nut_id = i
+ break
+ elif self.single_object_mode == 2:
+ self.obj_to_use = self.nuts[self.nut_id].name
+ if self.single_object_mode in {1, 2}:
+ nut_names.remove(self.obj_to_use)
+ self.clear_objects(list(nut_names))
+
+ # Make sure to update sensors' active and enabled states
+ if self.single_object_mode != 0:
+ for i, sensor_names in self.nut_id_to_sensors.items():
+ for name in sensor_names:
+ # Set all of these sensors to be enabled and active if this is the active nut, else False
+ self._observables[name].set_enabled(i == self.nut_id)
+ self._observables[name].set_active(i == self.nut_id)
+
+ def _check_success(self):
+ """
+ Check if all nuts have been successfully placed around their corresponding pegs.
+
+ Returns:
+ bool: True if all nuts are placed correctly
+ """
+ # remember objects that are on the correct pegs
+ gripper_site_pos = self.sim.data.site_xpos[self.robots[0].eef_site_id]
+ for i, nut in enumerate(self.nuts):
+ obj_str = nut.name
+ obj_pos = self.sim.data.body_xpos[self.obj_body_id[obj_str]]
+ dist = np.linalg.norm(gripper_site_pos - obj_pos)
+ r_reach = 1 - np.tanh(10.0 * dist)
+ self.objects_on_pegs[i] = int(self.on_peg(obj_pos, i) and r_reach < 0.6)
+
+ if self.single_object_mode > 0:
+ return np.sum(self.objects_on_pegs) > 0 # need one object on peg
+
+ # returns True if all objects are on correct pegs
+ return np.sum(self.objects_on_pegs) == len(self.nuts)
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualize gripper site proportional to the distance to the closest nut.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+ # Color the gripper visualization site according to its distance to the closest nut
+ if vis_settings["grippers"]:
+ # find closest object
+ dists = [
+ self._gripper_to_target(
+ gripper=self.robots[0].gripper,
+ target=nut.important_sites["handle"],
+ target_type="site",
+ return_distance=True,
+ )
+ for nut in self.nuts
+ ]
+ closest_nut_id = np.argmin(dists)
+ # Visualize the distance to this target
+ self._visualize_gripper_to_target(
+ gripper=self.robots[0].gripper,
+ target=self.nuts[closest_nut_id].important_sites["handle"],
+ target_type="site",
+ )
+
+
+class NutAssemblySingle(NutAssembly):
+ """
+ Easier version of task - place either one round nut or one square nut into its peg.
+ """
+
+ def __init__(self, **kwargs):
+ assert "single_object_mode" not in kwargs, "invalid set of arguments"
+ super().__init__(single_object_mode=1, **kwargs)
+
+
+class NutAssemblySquare(NutAssembly):
+ """
+ Easier version of task - place one square nut into its peg.
+ """
+
+ def __init__(self, **kwargs):
+ assert "single_object_mode" not in kwargs and "nut_type" not in kwargs, "invalid set of arguments"
+ super().__init__(single_object_mode=2, nut_type="square", **kwargs)
+
+
+class NutAssemblyRound(NutAssembly):
+ """
+ Easier version of task - place one round nut into its peg.
+ """
+
+ def __init__(self, **kwargs):
+ assert "single_object_mode" not in kwargs and "nut_type" not in kwargs, "invalid set of arguments"
+ super().__init__(single_object_mode=2, nut_type="round", **kwargs)
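The NutAssembly variants above expose the standard robosuite environment interface. Below is a minimal usage sketch for the single-nut NutAssemblySquare task with random actions; it assumes this vendored robosuite registers its environments through robosuite.make the same way upstream robosuite does, and the robot choice and keyword values are illustrative only.

    import numpy as np
    import robosuite as suite

    # Square nut only (single_object_mode=2, nut_type="square"); reward_shaping enables
    # the dense reach/grasp/lift/hover staged rewards defined above.
    env = suite.make(
        env_name="NutAssemblySquare",
        robots="Panda",
        has_renderer=False,
        has_offscreen_renderer=True,
        use_camera_obs=True,
        camera_names="agentview",
        reward_shaping=True,
    )

    obs = env.reset()
    low, high = env.action_spec
    for _ in range(200):
        action = np.random.uniform(low, high)  # random actions, just to exercise the API
        obs, reward, done, info = env.step(action)
        if done:
            break
    env.close()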
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/phantom.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/phantom.py
new file mode 100644
index 0000000000000000000000000000000000000000..de1b86ddea9ecf665d432e5f0e7adb45c1140ee0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/phantom.py
@@ -0,0 +1,299 @@
+import numpy as np
+
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.models.arenas import PhantomTableArena
+from robosuite.models.tasks import ManipulationTask
+
+
+class Phantom(SingleArmEnv):
+ """
+    This class corresponds to the Phantom table-top environment for a single robot arm.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+ Note: Must be a single single-arm robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+        use_object_obs (bool): if True, include object information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ placement_initializer (ObjectPositionSampler): if provided, will
+ be used to place objects on every reset, else a UniformRandomSampler
+ is used by default.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+            robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1.0, 5e-3, 1e-4),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ placement_initializer=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="frontview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ object_placements=None,
+ direct_gripper_control=False,
+ camera_pos=None,
+ camera_quat_wxyz=None,
+ camera_fov=None,
+ camera_sensorsize=None,
+ camera_principalpixel=None,
+ camera_focalpixel=None,
+ ):
+
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_friction = table_friction
+ self.table_offset = np.array((0, 0, 0.8))
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # object placement initializer
+ self.placement_initializer = placement_initializer
+
+ self.object_placements = object_placements
+ self.camera_pos = camera_pos
+ self.camera_quat_wxyz = camera_quat_wxyz
+ self.camera_fov = camera_fov
+ self.camera_sensorsize = camera_sensorsize
+ self.camera_principalpixel = camera_principalpixel
+ self.camera_focalpixel = camera_focalpixel
+
+ # pdb.set_trace()
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ direct_gripper_control=direct_gripper_control,
+ )
+
+ def reset(self, object_placements=None):
+ self.object_placements = object_placements
+ return super().reset()
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose accordingly
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = PhantomTableArena(
+ table_full_size=self.table_full_size,
+ table_friction=self.table_friction,
+ table_offset=self.table_offset,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ )
+
+ # Modify default frontview camera
+ if self.camera_pos is not None:
+ robot_base_pos = np.array([-0.56, 0, 0.912])
+ mujoco_arena.set_camera(
+ camera_name="frontview",
+ pos=self.camera_pos + robot_base_pos,
+ quat=self.camera_quat_wxyz,
+ camera_attribs={"sensorsize": np.array2string(self.camera_sensorsize)[1:-1],
+ "resolution": f"{self.camera_widths[0]} {self.camera_heights[0]}",
+ "principalpixel": np.array2string(self.camera_principalpixel)[1:-1],
+ "focalpixel": np.array2string(self.camera_focalpixel)[1:-1],}
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualize gripper site proportional to the distance to the cube.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+ # # Color the gripper visualization site according to its distance to the cube
+ # if vis_settings["grippers"]:
+ # self._visualize_gripper_to_target(gripper=self.robots[0].gripper, target=self.cubeA)
+
+ def reward(self, action):
+ return 0.0
\ No newline at end of file
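Phantom is intentionally a bare table-top workspace: _load_model builds only the arena and the robot, reward always returns 0.0, and the extra constructor arguments exist so the "frontview" camera can be re-posed to match a real camera. A hedged construction sketch follows; every numeric value is a placeholder, and in practice the camera parameters would come from the calibration of the camera that recorded the source video.

    import numpy as np
    from robosuite.environments.manipulation.phantom import Phantom

    env = Phantom(
        robots="Panda",
        use_camera_obs=True,
        camera_names="frontview",
        camera_heights=480,
        camera_widths=640,
        # Placeholder calibration; _load_model adds a fixed robot-base offset to camera_pos.
        camera_pos=np.array([1.0, 0.0, 0.4]),
        camera_quat_wxyz=np.array([0.653, 0.271, 0.271, 0.653]),
        camera_sensorsize=np.array([0.0036, 0.0024]),
        camera_principalpixel=np.array([320.0, 240.0]),
        camera_focalpixel=np.array([600.0, 600.0]),
    )

    obs = env.reset()
    frame = obs["frontview_image"]  # rendered RGB frame from the re-posed camera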
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/phantom_bimanual.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/phantom_bimanual.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf095f15977ef1ba6b68bc759150f928596a57e9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/phantom_bimanual.py
@@ -0,0 +1,341 @@
+
+from collections import OrderedDict
+
+import numpy as np
+import pdb
+from scipy.spatial.transform import Rotation
+
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.environments.manipulation.two_arm_env import TwoArmEnv
+# from robosuite.models.arenas import TableArena
+from robosuite.models.arenas import TableArena2, EmptyArena
+from robosuite.models.objects import BoxObject
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.mjcf_utils import CustomMaterial
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import UniformRandomSampler
+from robosuite.utils.transform_utils import convert_quat
+from robosuite.models.objects import BoxObject, CylinderObject
+
+class PhantomBimanual(TwoArmEnv):
+ """
+    This class corresponds to the Phantom bimanual environment, which mounts two robot arms according to
+    the chosen @bimanual_setup.
+
+    Args:
+        robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+            (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+            Note: Must be two single-arm robots!
+
+        bimanual_setup (str): Determines how the two robot bases are positioned and oriented. The environment
+            currently recognizes "tabletop", "shoulders", "shoulders1" and "shoulders2".
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+        use_object_obs (bool): if True, include object information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ placement_initializer (ObjectPositionSampler): if provided, will
+ be used to place objects on every reset, else a UniformRandomSampler
+ is used by default.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+            robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ bimanual_setup,
+ env_configuration="default",
+ controller_configs=None,
+ mount_types="default",
+ gripper_types="default",
+ initialization_noise="default",
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1.0, 5e-3, 1e-4),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ placement_initializer=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="zed",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ object_placements=None,
+ direct_gripper_control=False,
+ camera_pos=None,
+ camera_quat_wxyz=None,
+ camera_fov=None,
+ camera_sensorsize=None,
+ camera_principalpixel=None,
+ camera_focalpixel=None,
+ ):
+
+ self.bimanual_setup = bimanual_setup
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_friction = table_friction
+ self.table_offset = np.array((0, 0, 0.4))
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # object placement initializer
+ self.placement_initializer = placement_initializer
+
+ self.object_placements = object_placements
+ self.camera_pos = camera_pos
+ self.camera_quat_wxyz = camera_quat_wxyz
+ self.camera_fov = camera_fov
+ self.camera_sensorsize = camera_sensorsize
+ self.camera_principalpixel = camera_principalpixel
+ self.camera_focalpixel = camera_focalpixel
+
+ self.robot_base_height = 2.0
+ self.robot_base_offset = -0.5
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types=mount_types,
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ direct_gripper_control=direct_gripper_control,
+ )
+
+ def reset(self, object_placements=None):
+ self.object_placements = object_placements
+ return super().reset()
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ if self.bimanual_setup == "tabletop":
+ count = 0
+ for robot, offset, rotation in zip(self.robots, (-0.2, 0.2), (0, 0)):
+ xpos = np.array((0, offset, self.robot_base_height))
+ robot.robot_model.set_base_xpos(xpos)
+ rot = np.array((rotation, 0, np.pi)) if count == 1 else np.array((rotation, 0, 0))
+ robot.robot_model.set_base_ori(rot)
+ count += 1
+ elif self.bimanual_setup == "shoulders1":
+ count = 0
+ for robot, offset, rotation in zip(self.robots, (-0.2, 0.2), (np.pi*2/3, -np.pi*2/3)):
+ xpos = np.array((0, offset, self.robot_base_height))
+ robot.robot_model.set_base_xpos(xpos)
+ rot = np.array((rotation, 0, np.pi)) if count == 1 else np.array((rotation, 0, 0))
+ robot.robot_model.set_base_ori(rot)
+ count += 1
+ elif self.bimanual_setup == "shoulders2":
+ count = 0
+ for robot, offset, rotation in zip(self.robots, (-0.2, 0.2), (np.pi/3, -np.pi/3)):
+ xpos = np.array((0, offset, self.robot_base_height))
+ robot.robot_model.set_base_xpos(xpos)
+ rot = np.array((rotation, 0, np.pi)) if count == 1 else np.array((rotation, 0, 0))
+ robot.robot_model.set_base_ori(rot)
+ count += 1
+ elif self.bimanual_setup == "shoulders":
+ count = 0
+ for robot, offset, rotation in zip(self.robots, (-0.2, 0.2), (np.pi/3, -np.pi/3)):
+ if count == 1:
+ xpos = np.array((0, 0.2, self.robot_base_height+self.robot_base_offset+robot.robot_model.bottom_offset[2]))
+ else:
+ xpos = np.array((-0.00656507, -0.14111039, 1.58980033+robot.robot_model.bottom_offset[2]))
+ robot.robot_model.set_base_xpos(xpos)
+ if count == 1:
+ rot = np.array((rotation, 0, np.pi/2))
+ else:
+ rot = np.array((0.50415113, -0.05164374, -1.57347674))
+ robot.robot_model.set_base_ori(rot)
+ count += 1
+
+ mujoco_arena = EmptyArena()
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ )
+
+ # Modify zed camera
+ if self.camera_pos is not None:
+
+ mujoco_arena.set_camera(
+ camera_name="zed",
+ pos=self.camera_pos,
+ quat=self.camera_quat_wxyz,
+ camera_attribs={"sensorsize": np.array2string(self.camera_sensorsize)[1:-1],
+ "resolution": f"{self.camera_widths[0]} {self.camera_heights[0]}",
+ "principalpixel": np.array2string(self.camera_principalpixel)[1:-1],
+ "focalpixel": np.array2string(self.camera_focalpixel)[1:-1],}
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualize gripper site proportional to the distance to the cube.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+ def reward(self, action):
+ return 0.0
\ No newline at end of file
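PhantomBimanual follows the same pattern with two arms on an EmptyArena; the "zed" camera used for observations is created and posed via set_camera only when camera_pos is supplied, and bimanual_setup selects one of the hard-coded base-pose layouts. A hedged sketch with placeholder values, assuming two single-arm robots:

    import numpy as np
    from robosuite.environments.manipulation.phantom_bimanual import PhantomBimanual

    env = PhantomBimanual(
        robots=["Panda", "Panda"],      # two single-arm robots
        bimanual_setup="tabletop",      # or "shoulders", "shoulders1", "shoulders2"
        use_camera_obs=True,
        camera_names="zed",
        camera_heights=480,
        camera_widths=640,
        # Placeholder camera calibration (required whenever camera_pos is set,
        # since the camera attribute strings are built from these arrays).
        camera_pos=np.array([0.0, 0.0, 1.5]),
        camera_quat_wxyz=np.array([1.0, 0.0, 0.0, 0.0]),
        camera_sensorsize=np.array([0.0036, 0.0024]),
        camera_principalpixel=np.array([320.0, 240.0]),
        camera_focalpixel=np.array([600.0, 600.0]),
    )

    obs = env.reset()
    frame = obs["zed_image"]            # rendered RGB frame from the "zed" camera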
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/pick_place.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/pick_place.py
new file mode 100644
index 0000000000000000000000000000000000000000..d69a718d83830a3d0d5db618779e49dae3c8d717
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/pick_place.py
@@ -0,0 +1,838 @@
+import random
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.models.arenas import BinsArena
+from robosuite.models.objects import (
+ BreadObject,
+ BreadVisualObject,
+ CanObject,
+ CanVisualObject,
+ CerealObject,
+ CerealVisualObject,
+ MilkObject,
+ MilkVisualObject,
+)
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import SequentialCompositeSampler, UniformRandomSampler
+
+
+class PickPlace(SingleArmEnv):
+ """
+ This class corresponds to the pick place task for a single robot arm.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+ Note: Must be a single single-arm robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ bin1_pos (3-tuple): Absolute cartesian coordinates of the bin initially holding the objects
+
+ bin2_pos (3-tuple): Absolute cartesian coordinates of the goal bin
+
+ z_offset (float): amount of z offset for initializing objects in bin
+
+ z_rotation (float, tuple, or None): if provided, controls the range of z-rotation initialization
+ for the objects
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+        use_object_obs (bool): if True, include object information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ single_object_mode (int): specifies which version of the task to do. Note that
+ the observations change accordingly.
+
+ :`0`: corresponds to the full task with all types of objects.
+
+ :`1`: corresponds to an easier task with only one type of object initialized
+ on the table with every reset. The type is randomized on every reset.
+
+ :`2`: corresponds to an easier task with only one type of object initialized
+ on the table with every reset. The type is kept constant and will not
+ change between resets.
+
+ object_type (string): if provided, should be one of "milk", "bread", "cereal",
+ or "can". Determines which type of object will be spawned on every
+ environment reset. Only used if @single_object_mode is 2.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+            robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Invalid object type specified]
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ table_full_size=(0.39, 0.49, 0.82),
+ table_friction=(1, 0.005, 0.0001),
+ bin1_pos=(0.1, -0.25, 0.8),
+ bin2_pos=(0.1, 0.28, 0.8),
+ z_offset=0.,
+ z_rotation=None,
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ single_object_mode=0,
+ object_type=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # task settings
+ self.single_object_mode = single_object_mode
+ self.object_to_id = {"milk": 0, "bread": 1, "cereal": 2, "can": 3}
+ self.object_id_to_sensors = {} # Maps object id to sensor names for that object
+ self.obj_names = ["Milk", "Bread", "Cereal", "Can"]
+ if object_type is not None:
+ assert object_type in self.object_to_id.keys(), "invalid @object_type argument - choose one of {}".format(
+ list(self.object_to_id.keys())
+ )
+ self.object_id = self.object_to_id[object_type] # use for convenient indexing
+ self.obj_to_use = None
+
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_friction = table_friction
+
+ # settings for bin position
+ self.bin1_pos = np.array(bin1_pos)
+ self.bin2_pos = np.array(bin2_pos)
+ self.z_offset = z_offset # z offset for initializing items in bin
+ self.z_rotation = z_rotation # z rotation for initializing items in bin
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 1.0 per object if it is placed in its correct bin
+
+ Un-normalized components if using reward shaping, where the maximum is returned if not solved:
+
+        - Reaching: in [0, 0.1], increasing as the gripper approaches the closest remaining object
+        - Grasping: in {0, 0.35}, nonzero if the gripper is grasping an object
+        - Lifting: in {0, [0.35, 0.5]}, nonzero only if an object is grasped; increasing with lift height
+        - Hovering: in {0, [0.5, 0.7]}, nonzero only if an object is lifted; increasing as it approaches its target bin
+
+        Note that a successfully completed task (object in bin) will return 1.0 per object regardless of whether the
+        environment is using sparse or shaped rewards.
+
+        Note that the final reward is normalized and scaled by reward_scale / 4.0 (or 1.0 if only a single object is
+        being used) so that the max score is equal to reward_scale.
+
+ Args:
+ action (np.array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ # compute sparse rewards
+ self._check_success()
+ reward = np.sum(self.objects_in_bins)
+
+ # add in shaped rewards
+ if self.reward_shaping:
+ staged_rewards = self.staged_rewards()
+ reward += max(staged_rewards)
+ if self.reward_scale is not None:
+ reward *= self.reward_scale
+ if self.single_object_mode == 0:
+ reward /= 4.0
+ return reward
+
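+    # Worked example for reward() above (added; illustrative only): with
+    # reward_shaping=True, reward_scale=1.0 and single_object_mode=0, if two objects
+    # are already in their bins and the best staged reward for the remaining objects
+    # is 0.42 (hovering), then
+    #   reward = (2 + 0.42) * 1.0 / 4.0 = 0.605
+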
+ def staged_rewards(self):
+ """
+ Returns staged rewards based on current physical states.
+ Stages consist of reaching, grasping, lifting, and hovering.
+
+ Returns:
+ 4-tuple:
+
+ - (float) reaching reward
+ - (float) grasping reward
+ - (float) lifting reward
+ - (float) hovering reward
+ """
+
+ reach_mult = 0.1
+ grasp_mult = 0.35
+ lift_mult = 0.5
+ hover_mult = 0.7
+
+ # filter out objects that are already in the correct bins
+ active_objs = []
+ for i, obj in enumerate(self.objects):
+ if self.objects_in_bins[i]:
+ continue
+ active_objs.append(obj)
+
+ # reaching reward governed by distance to closest object
+ r_reach = 0.0
+ if active_objs:
+ # get reaching reward via minimum distance to a target object
+ dists = [
+ self._gripper_to_target(
+ gripper=self.robots[0].gripper,
+ target=active_obj.root_body,
+ target_type="body",
+ return_distance=True,
+ )
+ for active_obj in active_objs
+ ]
+ r_reach = (1 - np.tanh(10.0 * min(dists))) * reach_mult
+
+ # grasping reward for touching any objects of interest
+ r_grasp = (
+ int(
+ self._check_grasp(
+ gripper=self.robots[0].gripper,
+ object_geoms=[g for active_obj in active_objs for g in active_obj.contact_geoms],
+ )
+ )
+ * grasp_mult
+ )
+
+ # lifting reward for picking up an object
+ r_lift = 0.0
+ if active_objs and r_grasp > 0.0:
+ z_target = self.bin2_pos[2] + 0.25
+ object_z_locs = self.sim.data.body_xpos[[self.obj_body_id[active_obj.name] for active_obj in active_objs]][
+ :, 2
+ ]
+ z_dists = np.maximum(z_target - object_z_locs, 0.0)
+ r_lift = grasp_mult + (1 - np.tanh(15.0 * min(z_dists))) * (lift_mult - grasp_mult)
+
+ # hover reward for getting object above bin
+ r_hover = 0.0
+ if active_objs:
+ target_bin_ids = [self.object_to_id[active_obj.name.lower()] for active_obj in active_objs]
+ # segment objects into left of the bins and above the bins
+ object_xy_locs = self.sim.data.body_xpos[[self.obj_body_id[active_obj.name] for active_obj in active_objs]][
+ :, :2
+ ]
+ y_check = (
+ np.abs(object_xy_locs[:, 1] - self.target_bin_placements[target_bin_ids, 1]) < self.bin_size[1] / 4.0
+ )
+ x_check = (
+ np.abs(object_xy_locs[:, 0] - self.target_bin_placements[target_bin_ids, 0]) < self.bin_size[0] / 4.0
+ )
+ objects_above_bins = np.logical_and(x_check, y_check)
+ objects_not_above_bins = np.logical_not(objects_above_bins)
+ dists = np.linalg.norm(self.target_bin_placements[target_bin_ids, :2] - object_xy_locs, axis=1)
+            # objects not yet above their target bin build on the current lift reward (r_lift),
+            # while objects already above their bin build on the max lift reward (lift_mult) to encourage dropping
+ r_hover_all = np.zeros(len(active_objs))
+ r_hover_all[objects_above_bins] = lift_mult + (1 - np.tanh(10.0 * dists[objects_above_bins])) * (
+ hover_mult - lift_mult
+ )
+ r_hover_all[objects_not_above_bins] = r_lift + (1 - np.tanh(10.0 * dists[objects_not_above_bins])) * (
+ hover_mult - lift_mult
+ )
+ r_hover = np.max(r_hover_all)
+
+ return r_reach, r_grasp, r_lift, r_hover
+
+ def not_in_bin(self, obj_pos, bin_id):
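+        """
+        Checks whether @obj_pos lies outside the bin2 quadrant assigned to @bin_id.
+
+        Args:
+            obj_pos (np.array): (x, y, z) position of the object
+            bin_id (int): index of the target bin quadrant (0-3)
+
+        Returns:
+            bool: False only if the object is within the x-y bounds of its quadrant
+                and within 0.1 m above the bin base; True otherwise
+        """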
+
+ bin_x_low = self.bin2_pos[0]
+ bin_y_low = self.bin2_pos[1]
+ if bin_id == 0 or bin_id == 2:
+ bin_x_low -= self.bin_size[0] / 2
+ if bin_id < 2:
+ bin_y_low -= self.bin_size[1] / 2
+
+ bin_x_high = bin_x_low + self.bin_size[0] / 2
+ bin_y_high = bin_y_low + self.bin_size[1] / 2
+
+ res = True
+ if (
+ bin_x_low < obj_pos[0] < bin_x_high
+ and bin_y_low < obj_pos[1] < bin_y_high
+ and self.bin2_pos[2] < obj_pos[2] < self.bin2_pos[2] + 0.1
+ ):
+ res = False
+ return res
+
+ def _get_placement_initializer(self):
+ """
+ Helper function for defining placement initializer and object sampling bounds.
+ """
+ self.placement_initializer = SequentialCompositeSampler(name="ObjectSampler")
+
+ # can sample anywhere in bin
+ bin_x_half = self.model.mujoco_arena.table_full_size[0] / 2 - 0.05
+ bin_y_half = self.model.mujoco_arena.table_full_size[1] / 2 - 0.05
+
+ # each object should just be sampled in the bounds of the bin (with some tolerance)
+ self.placement_initializer.append_sampler(
+ sampler=UniformRandomSampler(
+ name="CollisionObjectSampler",
+ mujoco_objects=self.objects,
+ x_range=[-bin_x_half, bin_x_half],
+ y_range=[-bin_y_half, bin_y_half],
+ rotation=self.z_rotation,
+ rotation_axis="z",
+ ensure_object_boundary_in_range=True,
+ ensure_valid_placement=True,
+ reference_pos=self.bin1_pos,
+ z_offset=self.z_offset,
+ )
+ )
+
+ # each visual object should just be at the center of each target bin
+ index = 0
+ for vis_obj in self.visual_objects:
+
+ # get center of target bin
+ bin_x_low = self.bin2_pos[0]
+ bin_y_low = self.bin2_pos[1]
+ if index == 0 or index == 2:
+ bin_x_low -= self.bin_size[0] / 2
+ if index < 2:
+ bin_y_low -= self.bin_size[1] / 2
+ bin_x_high = bin_x_low + self.bin_size[0] / 2
+ bin_y_high = bin_y_low + self.bin_size[1] / 2
+ bin_center = np.array(
+ [
+ (bin_x_low + bin_x_high) / 2.0,
+ (bin_y_low + bin_y_high) / 2.0,
+ ]
+ )
+
+ # placement is relative to object bin, so compute difference and send to placement initializer
+ rel_center = bin_center - self.bin1_pos[:2]
+
+ self.placement_initializer.append_sampler(
+ sampler=UniformRandomSampler(
+ name=f"{vis_obj.name}ObjectSampler",
+ mujoco_objects=vis_obj,
+ x_range=[rel_center[0], rel_center[0]],
+ y_range=[rel_center[1], rel_center[1]],
+ rotation=0.0,
+ rotation_axis="z",
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=False,
+ reference_pos=self.bin1_pos,
+ z_offset=self.bin2_pos[2] - self.bin1_pos[2],
+ )
+ )
+ index += 1
+
+ def _construct_visual_objects(self):
+ """
+        Function that can be overridden by subclasses to load different objects.
+ """
+ self.visual_objects = []
+ for vis_obj_cls, obj_name in zip(
+ (MilkVisualObject, BreadVisualObject, CerealVisualObject, CanVisualObject),
+ self.obj_names,
+ ):
+ vis_name = "Visual" + obj_name
+ vis_obj = vis_obj_cls(name=vis_name)
+ self.visual_objects.append(vis_obj)
+
+ def _construct_objects(self):
+ """
+        Function that can be overridden by subclasses to load different objects.
+ """
+ self.objects = []
+ for obj_cls, obj_name in zip(
+ (MilkObject, BreadObject, CerealObject, CanObject),
+ self.obj_names,
+ ):
+ obj = obj_cls(name=obj_name)
+ self.objects.append(obj)
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose accordingly
+ xpos = self.robots[0].robot_model.base_xpos_offset["bins"]
+ self.robots[0].robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = BinsArena(
+ bin1_pos=self.bin1_pos, table_full_size=self.table_full_size, table_friction=self.table_friction
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # store some arena attributes
+ self.bin_size = mujoco_arena.table_full_size
+
+ # make objects
+ self._construct_visual_objects()
+ self._construct_objects()
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=self.visual_objects + self.objects,
+ )
+
+ # Generate placement initializer
+ self._get_placement_initializer()
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Additional object references from this env
+ self.obj_body_id = {}
+ self.obj_geom_id = {}
+
+ # object-specific ids
+ for obj in self.visual_objects + self.objects:
+ self.obj_body_id[obj.name] = self.sim.model.body_name2id(obj.root_body)
+ self.obj_geom_id[obj.name] = [self.sim.model.geom_name2id(g) for g in obj.contact_geoms]
+
+ # keep track of which objects are in their corresponding bins
+ self.objects_in_bins = np.zeros(len(self.objects))
+
+ # target locations in bin for each object type
+ self.target_bin_placements = np.zeros((len(self.objects), 3))
+ for i, obj in enumerate(self.objects):
+ bin_id = i
+ bin_x_low = self.bin2_pos[0]
+ bin_y_low = self.bin2_pos[1]
+ if bin_id == 0 or bin_id == 2:
+ bin_x_low -= self.bin_size[0] / 2.0
+ if bin_id < 2:
+ bin_y_low -= self.bin_size[1] / 2.0
+ bin_x_low += self.bin_size[0] / 4.0
+ bin_y_low += self.bin_size[1] / 4.0
+ self.target_bin_placements[i, :] = [bin_x_low, bin_y_low, self.bin2_pos[2]]
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ pf = self.robots[0].robot_model.naming_prefix
+ modality = "object"
+
+ # Reset obj sensor mappings
+ self.object_id_to_sensors = {}
+
+ # for conversion to relative gripper frame
+ @sensor(modality=modality)
+ def world_pose_in_gripper(obs_cache):
+ return (
+ T.pose_inv(T.pose2mat((obs_cache[f"{pf}eef_pos"], obs_cache[f"{pf}eef_quat"])))
+ if f"{pf}eef_pos" in obs_cache and f"{pf}eef_quat" in obs_cache
+ else np.eye(4)
+ )
+
+ sensors = [world_pose_in_gripper]
+ names = ["world_pose_in_gripper"]
+ enableds = [True]
+ actives = [False]
+
+ for i, obj in enumerate(self.objects):
+ # Create object sensors
+ using_obj = self.single_object_mode == 0 or self.object_id == i
+ obj_sensors, obj_sensor_names = self._create_obj_sensors(obj_name=obj.name, modality=modality)
+ sensors += obj_sensors
+ names += obj_sensor_names
+ enableds += [using_obj] * 4
+ actives += [using_obj] * 4
+ self.object_id_to_sensors[i] = obj_sensor_names
+
+ if self.single_object_mode == 1:
+                # the object is randomly sampled on each reset, so include its id in the observation
+ @sensor(modality=modality)
+ def obj_id(obs_cache):
+ return self.object_id
+
+ sensors.append(obj_id)
+ names.append("obj_id")
+ enableds.append(True)
+ actives.append(True)
+
+ # Create observables
+ for name, s, enabled, active in zip(names, sensors, enableds, actives):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ enabled=enabled,
+ active=active,
+ )
+
+ return observables
+
+ def _create_obj_sensors(self, obj_name, modality="object"):
+ """
+ Helper function to create sensors for a given object. This is abstracted in a separate function call so that we
+ don't have local function naming collisions during the _setup_observables() call.
+
+ Args:
+ obj_name (str): Name of object to create sensors for
+ modality (str): Modality to assign to all sensors
+
+ Returns:
+ 2-tuple:
+ sensors (list): Array of sensors for the given obj
+ names (list): array of corresponding observable names
+ """
+ pf = self.robots[0].robot_model.naming_prefix
+
+ @sensor(modality=modality)
+ def obj_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.obj_body_id[obj_name]])
+
+ @sensor(modality=modality)
+ def obj_quat(obs_cache):
+ return T.convert_quat(self.sim.data.body_xquat[self.obj_body_id[obj_name]], to="xyzw")
+
+ @sensor(modality=modality)
+ def obj_to_eef_pos(obs_cache):
+ # Immediately return default value if cache is empty
+ if any(
+ [name not in obs_cache for name in [f"{obj_name}_pos", f"{obj_name}_quat", "world_pose_in_gripper"]]
+ ):
+ return np.zeros(3)
+ obj_pose = T.pose2mat((obs_cache[f"{obj_name}_pos"], obs_cache[f"{obj_name}_quat"]))
+ rel_pose = T.pose_in_A_to_pose_in_B(obj_pose, obs_cache["world_pose_in_gripper"])
+ rel_pos, rel_quat = T.mat2pose(rel_pose)
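+            # cache the relative quaternion so that obj_to_eef_quat below can reuse it
+            # without recomputing the pose transform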
+ obs_cache[f"{obj_name}_to_{pf}eef_quat"] = rel_quat
+ return rel_pos
+
+ @sensor(modality=modality)
+ def obj_to_eef_quat(obs_cache):
+ return (
+ obs_cache[f"{obj_name}_to_{pf}eef_quat"] if f"{obj_name}_to_{pf}eef_quat" in obs_cache else np.zeros(4)
+ )
+
+ sensors = [obj_pos, obj_quat, obj_to_eef_pos, obj_to_eef_quat]
+ names = [f"{obj_name}_pos", f"{obj_name}_quat", f"{obj_name}_to_{pf}eef_pos", f"{obj_name}_to_{pf}eef_quat"]
+
+ return sensors, names
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # Loop through all objects and reset their positions
+ for obj_pos, obj_quat, obj in object_placements.values():
+ # Set the visual object body locations
+ if "visual" in obj.name.lower():
+ self.sim.model.body_pos[self.obj_body_id[obj.name]] = obj_pos
+ self.sim.model.body_quat[self.obj_body_id[obj.name]] = obj_quat
+ else:
+ # Set the collision object joints
+ self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
+
+ # Set the bins to the desired position
+ self.sim.model.body_pos[self.sim.model.body_name2id("bin1")] = self.bin1_pos
+ self.sim.model.body_pos[self.sim.model.body_name2id("bin2")] = self.bin2_pos
+
+ # Move objects out of the scene depending on the mode
+ obj_names = {obj.name for obj in self.objects}
+ if self.single_object_mode == 1:
+ self.obj_to_use = random.choice(list(obj_names))
+ for obj_type, i in self.object_to_id.items():
+ if obj_type.lower() in self.obj_to_use.lower():
+ self.object_id = i
+ break
+ elif self.single_object_mode == 2:
+ self.obj_to_use = self.objects[self.object_id].name
+ if self.single_object_mode in {1, 2}:
+ obj_names.remove(self.obj_to_use)
+ self.clear_objects(list(obj_names))
+
+ # Make sure to update sensors' active and enabled states
+ if self.single_object_mode != 0:
+ for i, sensor_names in self.object_id_to_sensors.items():
+ for name in sensor_names:
+ # Set all of these sensors to be enabled and active if this is the active object, else False
+ self._observables[name].set_enabled(i == self.object_id)
+ self._observables[name].set_active(i == self.object_id)
+
+ def _check_success(self):
+ """
+ Check if all objects have been successfully placed in their corresponding bins.
+
+ Returns:
+ bool: True if all objects are placed correctly
+ """
+ # remember objects that are in the correct bins
+ gripper_site_pos = self.sim.data.site_xpos[self.robots[0].eef_site_id]
+ for i, obj in enumerate(self.objects):
+ obj_str = obj.name
+ obj_pos = self.sim.data.body_xpos[self.obj_body_id[obj_str]]
+ dist = np.linalg.norm(gripper_site_pos - obj_pos)
+ r_reach = 1 - np.tanh(10.0 * dist)
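+            # an object only counts as placed if it is inside its bin AND the gripper
+            # has moved away from it (r_reach < 0.6)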
+ self.objects_in_bins[i] = int((not self.not_in_bin(obj_pos, i)) and r_reach < 0.6)
+
+ # returns True if a single object is in the correct bin
+ if self.single_object_mode in {1, 2}:
+ return np.sum(self.objects_in_bins) > 0
+
+ # returns True if all objects are in correct bins
+ return np.sum(self.objects_in_bins) == len(self.objects)
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualize gripper site proportional to the distance to the closest object.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+ # Color the gripper visualization site according to its distance to the closest object
+ if vis_settings["grippers"]:
+ # find closest object
+ dists = [
+ self._gripper_to_target(
+ gripper=self.robots[0].gripper,
+ target=obj.root_body,
+ target_type="body",
+ return_distance=True,
+ )
+ for obj in self.objects
+ ]
+ closest_obj_id = np.argmin(dists)
+ # Visualize the distance to this target
+ self._visualize_gripper_to_target(
+ gripper=self.robots[0].gripper,
+ target=self.objects[closest_obj_id].root_body,
+ target_type="body",
+ )
+
+
+class PickPlaceSingle(PickPlace):
+ """
+ Easier version of task - place one object into its bin.
+ A new object is sampled on every reset.
+ """
+
+ def __init__(self, **kwargs):
+ assert "single_object_mode" not in kwargs, "invalid set of arguments"
+ super().__init__(single_object_mode=1, **kwargs)
+
+
+class PickPlaceMilk(PickPlace):
+ """
+ Easier version of task - place one milk into its bin.
+ """
+
+ def __init__(self, **kwargs):
+ assert "single_object_mode" not in kwargs and "object_type" not in kwargs, "invalid set of arguments"
+ super().__init__(single_object_mode=2, object_type="milk", **kwargs)
+
+
+class PickPlaceBread(PickPlace):
+ """
+ Easier version of task - place one bread into its bin.
+ """
+
+ def __init__(self, **kwargs):
+ assert "single_object_mode" not in kwargs and "object_type" not in kwargs, "invalid set of arguments"
+ super().__init__(single_object_mode=2, object_type="bread", **kwargs)
+
+
+class PickPlaceCereal(PickPlace):
+ """
+ Easier version of task - place one cereal into its bin.
+ """
+
+ def __init__(self, **kwargs):
+ assert "single_object_mode" not in kwargs and "object_type" not in kwargs, "invalid set of arguments"
+ super().__init__(single_object_mode=2, object_type="cereal", **kwargs)
+
+
+class PickPlaceCan(PickPlace):
+ """
+ Easier version of task - place one can into its bin.
+ """
+
+ def __init__(self, **kwargs):
+ assert "single_object_mode" not in kwargs and "object_type" not in kwargs, "invalid set of arguments"
+ super().__init__(single_object_mode=2, object_type="can", **kwargs)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/single_arm_env.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/single_arm_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc3c9ac41557d644fdc7476bb1b1c488fd43b9e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/single_arm_env.py
@@ -0,0 +1,72 @@
+import numpy as np
+
+from robosuite.environments.manipulation.manipulation_env import ManipulationEnv
+from robosuite.robots import SingleArm
+from robosuite.utils.transform_utils import mat2quat
+
+
+class SingleArmEnv(ManipulationEnv):
+ """
+ A manipulation environment intended for a single robot arm.
+ """
+
+ def _load_model(self):
+ """
+ Verifies correct robot model is loaded
+ """
+ super()._load_model()
+
+ # Verify the correct robot has been loaded
+ assert isinstance(
+ self.robots[0], SingleArm
+ ), "Error: Expected one single-armed robot! Got {} type instead.".format(type(self.robots[0]))
+
+ def _check_robot_configuration(self, robots):
+ """
+        Sanity check to make sure the given robots and configuration are acceptable
+
+ Args:
+ robots (str or list of str): Robots to instantiate within this env
+ """
+ super()._check_robot_configuration(robots)
+ if type(robots) is list:
+ assert len(robots) == 1, "Error: Only one robot should be inputted for this task!"
+
+ @property
+ def _eef_xpos(self):
+ """
+ Grabs End Effector position
+
+ Returns:
+            np.array: End effector position (x,y,z)
+ """
+ return np.array(self.sim.data.site_xpos[self.robots[0].eef_site_id])
+
+ @property
+ def _eef_xmat(self):
+ """
+ End Effector orientation as a rotation matrix
+        Note that the orientation is read from the gripper's grip site ("right_grip_site" in the
+        bimanual configuration, "grip_site" otherwise).
+
+ Returns:
+ np.array: (3,3) End Effector orientation matrix
+ """
+ pf = self.robots[0].gripper.naming_prefix
+
+ if self.env_configuration == "bimanual":
+ return np.array(self.sim.data.site_xmat[self.sim.model.site_name2id(pf + "right_grip_site")]).reshape(3, 3)
+ else:
+ return np.array(self.sim.data.site_xmat[self.sim.model.site_name2id(pf + "grip_site")]).reshape(3, 3)
+
+ @property
+ def _eef_xquat(self):
+ """
+ End Effector orientation as a (x,y,z,w) quaternion
+        Note that the orientation is derived from the same gripper grip site used by `_eef_xmat`.
+
+ Returns:
+ np.array: (x,y,z,w) End Effector quaternion
+ """
+ return mat2quat(self._eef_xmat)
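+
+
+# Illustrative note (added; not part of upstream robosuite): inside a subclass, the
+# end-effector helpers above can be combined into a 4x4 homogeneous pose, assuming
+# robosuite's transform utilities are available:
+#
+#   import robosuite.utils.transform_utils as T
+#   eef_pose = T.make_pose(self._eef_xpos, self._eef_xmat)  # 4x4 world pose of the end effector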
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/stack.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..992ab7e2111a11d1efa836c7c06003249bd7ae2c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/stack.py
@@ -0,0 +1,499 @@
+from collections import OrderedDict
+
+import numpy as np
+
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.models.arenas import TableArena
+from robosuite.models.objects import BoxObject
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.mjcf_utils import CustomMaterial
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import UniformRandomSampler
+from robosuite.utils.transform_utils import convert_quat
+
+
+class Stack(SingleArmEnv):
+ """
+ This class corresponds to the stacking task for a single robot arm.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+ Note: Must be a single single-arm robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+ use_object_obs (bool): if True, include object (cube) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ placement_initializer (ObjectPositionSampler): if provided, will
+ be used to place objects on every reset, else a UniformRandomSampler
+ is used by default.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+                robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1.0, 5e-3, 1e-4),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ placement_initializer=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_friction = table_friction
+ self.table_offset = np.array((0, 0, 0.8))
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # object placement initializer
+ self.placement_initializer = placement_initializer
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 2.0 is provided if the red block is stacked on the green block
+
+ Un-normalized components if using reward shaping:
+
+ - Reaching: in [0, 0.25], to encourage the arm to reach the cube
+ - Grasping: in {0, 0.25}, non-zero if arm is grasping the cube
+ - Lifting: in {0, 1}, non-zero if arm has lifted the cube
+ - Aligning: in [0, 0.5], encourages aligning one cube over the other
+ - Stacking: in {0, 2}, non-zero if cube is stacked on other cube
+
+ The reward is max over the following:
+
+ - Reaching + Grasping
+ - Lifting + Aligning
+ - Stacking
+
+ The sparse reward only consists of the stacking component.
+
+ Note that the final reward is normalized and scaled by
+ reward_scale / 2.0 as well so that the max score is equal to reward_scale
+
+ Args:
+ action (np array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ r_reach, r_lift, r_stack = self.staged_rewards()
+ if self.reward_shaping:
+ reward = max(r_reach, r_lift, r_stack)
+ else:
+ reward = 2.0 if r_stack > 0 else 0.0
+
+ if self.reward_scale is not None:
+ reward *= self.reward_scale / 2.0
+
+ return reward
+
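+    # Worked example for reward() above (added; illustrative only): with
+    # reward_shaping=True and reward_scale=1.0, staged rewards of
+    # (r_reach=0.3, r_lift=1.4, r_stack=0.0) give
+    #   reward = max(0.3, 1.4, 0.0) * 1.0 / 2.0 = 0.7
+    # With reward_shaping=False the same state yields 0.0, since r_stack == 0.
+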
+ def staged_rewards(self):
+ """
+ Helper function to calculate staged rewards based on current physical states.
+
+ Returns:
+ 3-tuple:
+
+ - (float): reward for reaching and grasping
+ - (float): reward for lifting and aligning
+ - (float): reward for stacking
+ """
+ # reaching is successful when the gripper site is close to the center of the cube
+ cubeA_pos = self.sim.data.body_xpos[self.cubeA_body_id]
+ cubeB_pos = self.sim.data.body_xpos[self.cubeB_body_id]
+ gripper_site_pos = self.sim.data.site_xpos[self.robots[0].eef_site_id]
+ dist = np.linalg.norm(gripper_site_pos - cubeA_pos)
+ r_reach = (1 - np.tanh(10.0 * dist)) * 0.25
+
+ # grasping reward
+ grasping_cubeA = self._check_grasp(gripper=self.robots[0].gripper, object_geoms=self.cubeA)
+ if grasping_cubeA:
+ r_reach += 0.25
+
+ # lifting is successful when the cube is above the table top by a margin
+ cubeA_height = cubeA_pos[2]
+ table_height = self.table_offset[2]
+ cubeA_lifted = cubeA_height > table_height + 0.04
+ r_lift = 1.0 if cubeA_lifted else 0.0
+
+ # Aligning is successful when cubeA is right above cubeB
+ if cubeA_lifted:
+ horiz_dist = np.linalg.norm(np.array(cubeA_pos[:2]) - np.array(cubeB_pos[:2]))
+ r_lift += 0.5 * (1 - np.tanh(horiz_dist))
+
+        # stacking is successful when cubeA is lifted, in contact with cubeB, and no longer grasped
+ r_stack = 0
+ cubeA_touching_cubeB = self.check_contact(self.cubeA, self.cubeB)
+ if not grasping_cubeA and r_lift > 0 and cubeA_touching_cubeB:
+ r_stack = 2.0
+
+ return r_reach, r_lift, r_stack
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose accordingly
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = TableArena(
+ table_full_size=self.table_full_size,
+ table_friction=self.table_friction,
+ table_offset=self.table_offset,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # initialize objects of interest
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "1 1",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ redwood = CustomMaterial(
+ texture="WoodRed",
+ tex_name="redwood",
+ mat_name="redwood_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ greenwood = CustomMaterial(
+ texture="WoodGreen",
+ tex_name="greenwood",
+ mat_name="greenwood_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.cubeA = BoxObject(
+ name="cubeA",
+ size_min=[0.02, 0.02, 0.02],
+ size_max=[0.02, 0.02, 0.02],
+ rgba=[1, 0, 0, 1],
+ material=redwood,
+ )
+ self.cubeB = BoxObject(
+ name="cubeB",
+ size_min=[0.025, 0.025, 0.025],
+ size_max=[0.025, 0.025, 0.025],
+ rgba=[0, 1, 0, 1],
+ material=greenwood,
+ )
+ cubes = [self.cubeA, self.cubeB]
+ # Create placement initializer
+ if self.placement_initializer is not None:
+ self.placement_initializer.reset()
+ self.placement_initializer.add_objects(cubes)
+ else:
+ self.placement_initializer = UniformRandomSampler(
+ name="ObjectSampler",
+ mujoco_objects=cubes,
+ x_range=[-0.08, 0.08],
+ y_range=[-0.08, 0.08],
+ rotation=None,
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=True,
+ reference_pos=self.table_offset,
+ z_offset=0.01,
+ )
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=cubes,
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Additional object references from this env
+ self.cubeA_body_id = self.sim.model.body_name2id(self.cubeA.root_body)
+ self.cubeB_body_id = self.sim.model.body_name2id(self.cubeB.root_body)
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # Loop through all objects and reset their positions
+ for obj_pos, obj_quat, obj in object_placements.values():
+ self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ pf = self.robots[0].robot_model.naming_prefix
+ modality = "object"
+
+ # position and rotation of the first cube
+ @sensor(modality=modality)
+ def cubeA_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.cubeA_body_id])
+
+ @sensor(modality=modality)
+ def cubeA_quat(obs_cache):
+ return convert_quat(np.array(self.sim.data.body_xquat[self.cubeA_body_id]), to="xyzw")
+
+ @sensor(modality=modality)
+ def cubeB_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.cubeB_body_id])
+
+ @sensor(modality=modality)
+ def cubeB_quat(obs_cache):
+ return convert_quat(np.array(self.sim.data.body_xquat[self.cubeB_body_id]), to="xyzw")
+
+ @sensor(modality=modality)
+ def gripper_to_cubeA(obs_cache):
+ return (
+ obs_cache["cubeA_pos"] - obs_cache[f"{pf}eef_pos"]
+ if "cubeA_pos" in obs_cache and f"{pf}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def gripper_to_cubeB(obs_cache):
+ return (
+ obs_cache["cubeB_pos"] - obs_cache[f"{pf}eef_pos"]
+ if "cubeB_pos" in obs_cache and f"{pf}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def cubeA_to_cubeB(obs_cache):
+ return (
+ obs_cache["cubeB_pos"] - obs_cache["cubeA_pos"]
+ if "cubeA_pos" in obs_cache and "cubeB_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ sensors = [cubeA_pos, cubeA_quat, cubeB_pos, cubeB_quat, gripper_to_cubeA, gripper_to_cubeB, cubeA_to_cubeB]
+ names = [s.__name__ for s in sensors]
+
+ # Create observables
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _check_success(self):
+ """
+ Check if blocks are stacked correctly.
+
+ Returns:
+ bool: True if blocks are correctly stacked
+ """
+ _, _, r_stack = self.staged_rewards()
+ return r_stack > 0
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualize gripper site proportional to the distance to the cube.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+ # Color the gripper visualization site according to its distance to the cube
+ if vis_settings["grippers"]:
+ self._visualize_gripper_to_target(gripper=self.robots[0].gripper, target=self.cubeA)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/tool_hang.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/tool_hang.py
new file mode 100644
index 0000000000000000000000000000000000000000..df5d63806a1e612a0e359b3466b5483f39706854
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/tool_hang.py
@@ -0,0 +1,736 @@
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.models.arenas import TableArena
+from robosuite.models.objects import HookFrame, RatchetingWrenchObject, StandWithMount
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.mjcf_utils import CustomMaterial
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import SequentialCompositeSampler, UniformRandomSampler
+from robosuite.utils.sim_utils import check_contact
+
+
+class ToolHang(SingleArmEnv):
+ """
+ This class corresponds to the tool hang task for a single robot arm.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+ Note: Must be a single single-arm robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+        use_object_obs (bool): if True, include object (stand, frame, and tool) state information in
+            the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+                robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1.0, 5e-3, 1e-4),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_friction = table_friction
+ self.table_offset = np.array((0, 0, 0.8))
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Args:
+ action (np array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ reward = 0.0
+
+ # sparse completion reward
+ if self._check_success():
+ reward = 1.0
+
+ # Scale reward if requested
+ if self.reward_scale is not None:
+ reward *= self.reward_scale
+
+ return reward
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+
+ Made some aspects easier than the real world task:
+ - increase base thickness for stand
+ - increase mount width to 1.2 cm
+ - add hole visualization
+ - reduce hook height on stand a little
+ - reduce tool ends height a little
+ """
+ super()._load_model()
+
+ # Adjust base pose accordingly
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = TableArena(
+ table_full_size=self.table_full_size,
+ table_friction=self.table_friction,
+ table_offset=self.table_offset,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # Modify default agentview camera
+ mujoco_arena.set_camera(
+ camera_name="agentview",
+ pos=[0.4837275266036987, 0.2505579098815722, 1.2639379055124524],
+ quat=[0.39713290333747864, 0.27807527780532837, 0.5016612410545349, 0.7164464592933655],
+ )
+
+ # Add sideview
+ mujoco_arena.set_camera(
+ camera_name="sideview",
+ pos=[0.4837275266036987, 0.2505579098815722, 1.2139379055124524],
+ quat=[0.39713290333747864, 0.27807527780532837, 0.5016612410545349, 0.7164464592933655],
+ )
+
+ # Create stand, frame, and tool
+ self.stand_args = dict(
+ name="stand",
+ size=(
+ (12.0 / 100.0),
+ (14.0 / 100.0),
+ (16.0 / 100.0),
+ ), # 14 cm x 12 cm base, with 16 cm height (in real world we cut the 32 cm height stand in half as well)
+ mount_location=(0.0, (4.5 / 100.0)), # 2.5 cm from right edge, so 4.5 cm to the right
+ mount_width=(1.2 / 100.0), # 1.2 cm thickness for rod cavity
+ wall_thickness=(0.1 / 100.0), # about 0.1-0.2 cm thickness for walls
+ base_thickness=(1 / 100.0), # increased thickness to 1 cm (different from real)
+ initialize_on_side=False,
+ add_hole_vis=True,
+ density=50000.0,
+ solref=(0.02, 1.0),
+ solimp=(0.998, 0.998, 0.001),
+ )
+ self.stand = StandWithMount(**self.stand_args)
+
+ self.frame_args = dict(
+ name="frame",
+ frame_length=(9.5 / 100.0), # 9.5 cm wide
+ frame_height=(18.0 / 100.0), # 18 cm tall (in real world we cut the physical 36 cm rod in half as well)
+ frame_thickness=(0.75 / 100.0), # 0.75 cm thick
+ hook_height=(1.2 / 100.0), # lowered to 1.2 cm tall (instead of 1.7 cm in real world)
+ grip_location=((9.0 - 3.0) / 100.0)
+ - (0.75 / 200.0), # move up by half height of frame minus half height of grip minus half thickness
+ grip_size=((2.54 / 200.0), (6.35 / 200.0)), # 6.35 cm length, 2.54 cm thick
+ tip_size=(
+ (2.54 / 200.0),
+ (0.2 / 200.0),
+ (0.65 / 200.0),
+ (1.905 / 100.0),
+ ), # 1-inch cylinder, 0.65 inch solder tip
+ density=500.0,
+ solref=(0.02, 1.0),
+ solimp=(0.998, 0.998, 0.001),
+ )
+ self.frame = HookFrame(**self.frame_args)
+
+ self.real_tool_args = dict(
+ name="tool",
+ handle_size=(
+ (16.5 / 200.0),
+ (1.75 / 200.0),
+ (0.32 / 200.0),
+ ), # 16.5 cm length, 1.75 cm width, 0.32 cm thick (1.5 cm with foam)
+ outer_radius_1=(3.5 / 200.0), # larger hole 3.5 cm outer diameter
+ inner_radius_1=(2.1 / 200.0), # reduced larger hole 2.1 cm inner diameter (from real world 2.3 cm)
+ height_1=(0.7 / 200.0), # 0.7 cm height
+ outer_radius_2=(3.0 / 200.0), # smaller hole 3 cm outer diameter
+            inner_radius_2=(2.0 / 200.0),  # smaller hole 2 cm inner diameter
+ height_2=(0.7 / 200.0), # 0.7 cm height
+ ngeoms=8,
+ grip_size=((3 / 200.0), (8.0 / 200.0)), # 8 cm length, 3 cm thick
+ density=2000.0,
+ solref=(0.02, 1.0),
+ solimp=(0.998, 0.998, 0.001),
+ friction=(0.95, 0.3, 0.1),
+ )
+
+ self.tool_args = self.real_tool_args
+ self.tool = RatchetingWrenchObject(**self.tool_args)
+
+ # Create placement initializer
+ self._get_placement_initializer()
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=[self.stand, self.frame, self.tool],
+ )
+
+ def _get_placement_initializer(self):
+ """
+ Helper function for defining placement initializer and object sampling bounds
+ """
+ # Create placement initializer
+ self.placement_initializer = SequentialCompositeSampler(name="ObjectSampler")
+
+ # Pre-define settings for each object's placement
+ objects = [self.stand, self.frame, self.tool]
+ x_centers = [-self.table_full_size[0] * 0.1, -self.table_full_size[0] * 0.05, self.table_full_size[0] * 0.05]
+ y_centers = [0.0, -self.table_full_size[1] * 0.3, -self.table_full_size[1] * 0.25]
+ x_tols = [0.0, 0.02, 0.02]
+ y_tols = [0.0, 0.02, 0.02]
+ rot_centers = [0, (-np.pi / 2) + (np.pi / 6), (-np.pi / 2) - (np.pi / 9.0)]
+ rot_tols = [0.0, np.pi / 18, np.pi / 18.0]
+ rot_axes = ["z", "y", "z"]
+ z_offsets = [
+ 0.001,
+ (self.frame_args["frame_thickness"] - self.frame_args["frame_height"]) / 2.0
+ + 0.001
+ + (self.stand_args["base_thickness"] / 2.0)
+ + (self.frame_args["grip_size"][1]),
+ 0.001,
+ ]
+ if ("tip_size" in self.frame_args) and (self.frame_args["tip_size"] is not None):
+ z_offsets[1] -= self.frame_args["tip_size"][0] + 2.0 * self.frame_args["tip_size"][3]
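+        # Illustrative check (added comment): with the default frame/stand arguments
+        # above, the frame's z offset works out to roughly
+        #   (0.0075 - 0.18) / 2 + 0.001 + 0.01 / 2 + 0.03175 - (0.0127 + 2 * 0.01905) ~= -0.099 m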
+ for obj, x, y, x_tol, y_tol, r, r_tol, r_axis, z_offset in zip(
+ objects, x_centers, y_centers, x_tols, y_tols, rot_centers, rot_tols, rot_axes, z_offsets
+ ):
+ # Create sampler for this object and add it to the sequential sampler
+ self.placement_initializer.append_sampler(
+ sampler=UniformRandomSampler(
+ name=f"{obj.name}ObjectSampler",
+ mujoco_objects=obj,
+ x_range=[x - x_tol, x + x_tol],
+ y_range=[y - y_tol, y + y_tol],
+ rotation=[r - r_tol, r + r_tol],
+ rotation_axis=r_axis,
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=False,
+ reference_pos=self.table_offset,
+ z_offset=z_offset,
+ )
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Additional object references from this env
+ self.obj_body_id = dict(
+ stand=self.sim.model.body_name2id(self.stand.root_body),
+ frame=self.sim.model.body_name2id(self.frame.root_body),
+ tool=self.sim.model.body_name2id(self.tool.root_body),
+ )
+
+ # Important sites:
+ # tool_hole1_center - for checking hanging
+ # frame_hang_site, frame_mount_site, frame_intersection_site - for orienting the hook, and checking hanging
+ # stand_mount_site - for checking that stand base is upright
+ self.obj_site_id = dict(
+ tool_hole1_center=self.sim.model.site_name2id("tool_hole1_center"), # center of one end of wrench
+ # tool_hole2_center=self.sim.model.site_name2id("tool_hole2_center"), # center of other end of wrench
+ frame_hang_site=self.sim.model.site_name2id("frame_hang_site"), # end of frame where hanging takes place
+ frame_mount_site=self.sim.model.site_name2id(
+ "frame_mount_site"
+ ), # bottom of frame that needs to be inserted into base
+ frame_intersection_site=self.sim.model.site_name2id("frame_intersection_site"), # corner of frame
+ stand_mount_site=self.sim.model.site_name2id(
+ "stand_mount_site"
+ ), # where frame needs to be inserted into stand
+ )
+ if ("tip_size" in self.frame_args) and (self.frame_args["tip_size"] is not None):
+ self.obj_site_id["frame_tip_site"] = self.sim.model.site_name2id("frame_tip_site") # tip site for insertion
+
+ # Important geoms:
+ # stand_base - for checking that stand base is upright
+ # stand wall geoms - for checking rod insertion into stand
+ # tool hole geoms - for checking insertion
+ self.obj_geom_id = dict(
+ stand_base=self.sim.model.geom_name2id("stand_base"), # bottom of stand
+ )
+ for i in range(4):
+ self.obj_geom_id["stand_wall_{}".format(i)] = self.sim.model.geom_name2id("stand_wall{}".format(i))
+ for i in range(self.tool_args["ngeoms"]):
+ self.obj_geom_id["tool_hole1_hc_{}".format(i)] = self.sim.model.geom_name2id("tool_hole1_hc_{}".format(i))
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ pf = self.robots[0].robot_model.naming_prefix
+ modality = "object"
+
+ # for conversion to relative gripper frame
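+            # (i.e. the 4x4 transform that maps world-frame poses into the gripper frame; it is registered
+            # as an inactive observable below, so it is not returned to the agent but the relative-pose
+            # sensors can still read it from obs_cache)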
+ @sensor(modality=modality)
+ def world_pose_in_gripper(obs_cache):
+ return (
+ T.pose_inv(T.pose2mat((obs_cache[f"{pf}eef_pos"], obs_cache[f"{pf}eef_quat"])))
+ if f"{pf}eef_pos" in obs_cache and f"{pf}eef_quat" in obs_cache
+ else np.eye(4)
+ )
+
+ sensors = [world_pose_in_gripper]
+ names = ["world_pose_in_gripper"]
+ actives = [False]
+
+ # Add absolute and relative pose for each object
+ obj_names = ["base", "frame", "tool"]
+ query_names = ["stand_base", "frame_intersection_site", "tool"]
+ query_types = ["geom", "site", "body"]
+ for i in range(len(obj_names)):
+ obj_sensors, obj_sensor_names = self._create_obj_sensors(
+ obj_name=obj_names[i], modality=modality, query_name=query_names[i], query_type=query_types[i]
+ )
+ sensors += obj_sensors
+ names += obj_sensor_names
+ actives += [True] * len(obj_sensors)
+
+ # Key boolean checks
+ @sensor(modality=modality)
+ def frame_is_assembled(obs_cache):
+ return [float(self._check_frame_assembled())]
+
+ @sensor(modality=modality)
+ def tool_on_frame(obs_cache):
+ return [float(self._check_tool_on_frame())]
+
+ sensors += [frame_is_assembled, tool_on_frame]
+ names += [frame_is_assembled.__name__, tool_on_frame.__name__]
+ actives += [True, True]
+
+ # Create observables
+ for name, s, active in zip(names, sensors, actives):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ active=active,
+ )
+
+ return observables
+
+ def _create_obj_sensors(self, obj_name, modality="object", query_name=None, query_type="body"):
+ """
+ Helper function to create sensors for a given object. This is abstracted in a separate function call so that we
+ don't have local function naming collisions during the _setup_observables() call.
+
+ Args:
+ obj_name (str): Name of object to create sensors for (used for naming observations)
+ modality (str): Modality to assign to all sensors
+ query_name (str): Name to query mujoco for the pose attributes of this object - if None, use @obj_name
+ query_type (str): Either "body", "geom", or "site" - type of mujoco sensor that will be queried for pose
+
+ Returns:
+ 2-tuple:
+ sensors (list): Array of sensors for the given obj
+ names (list): array of corresponding observable names
+ """
+ if query_name is None:
+ query_name = obj_name
+
+ assert query_type in ["body", "geom", "site"]
+ if query_type == "body":
+ id_lookup = self.obj_body_id
+ pos_lookup = self.sim.data.body_xpos
+ mat_lookup = self.sim.data.body_xmat
+ elif query_type == "geom":
+ id_lookup = self.obj_geom_id
+ pos_lookup = self.sim.data.geom_xpos
+ mat_lookup = self.sim.data.geom_xmat
+ else:
+ id_lookup = self.obj_site_id
+ pos_lookup = self.sim.data.site_xpos
+ mat_lookup = self.sim.data.site_xmat
+
+ ### TODO: this was slightly modified from pick-place - do we want to move this into utils to share it? ###
+ pf = self.robots[0].robot_model.naming_prefix
+
+ @sensor(modality=modality)
+ def obj_pos(obs_cache):
+ return np.array(pos_lookup[id_lookup[query_name]])
+
+ @sensor(modality=modality)
+ def obj_quat(obs_cache):
+ return T.mat2quat(np.array(mat_lookup[id_lookup[query_name]]).reshape(3, 3))
+
+ @sensor(modality=modality)
+ def obj_to_eef_pos(obs_cache):
+ # Immediately return default value if cache is empty
+ if any(
+ [name not in obs_cache for name in [f"{obj_name}_pos", f"{obj_name}_quat", "world_pose_in_gripper"]]
+ ):
+ return np.zeros(3)
+ obj_pose = T.pose2mat((obs_cache[f"{obj_name}_pos"], obs_cache[f"{obj_name}_quat"]))
+ rel_pose = T.pose_in_A_to_pose_in_B(obj_pose, obs_cache["world_pose_in_gripper"])
+ rel_pos, rel_quat = T.mat2pose(rel_pose)
+ obs_cache[f"{obj_name}_to_{pf}eef_quat"] = rel_quat
+ return rel_pos
+
+ @sensor(modality=modality)
+ def obj_to_eef_quat(obs_cache):
+ return (
+ obs_cache[f"{obj_name}_to_{pf}eef_quat"] if f"{obj_name}_to_{pf}eef_quat" in obs_cache else np.zeros(4)
+ )
+
+ sensors = [obj_pos, obj_quat, obj_to_eef_pos, obj_to_eef_quat]
+ names = [f"{obj_name}_pos", f"{obj_name}_quat", f"{obj_name}_to_{pf}eef_pos", f"{obj_name}_to_{pf}eef_quat"]
+
+ return sensors, names
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # Loop through all objects and reset their positions
+ for obj_pos, obj_quat, obj in object_placements.values():
+ self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
+
+ def visualize(self, vis_settings):
+ """
+        In addition to super call, visualize gripper site proportional to the distance to the tool.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+        # Color the gripper visualization site according to its distance to the tool
+ if vis_settings["grippers"]:
+ self._visualize_gripper_to_target(gripper=self.robots[0].gripper, target=self.tool)
+
+ def _check_success(self):
+ """
+        Check if the tool is hung on the frame correctly and the frame is assembled correctly as well.
+
+ Returns:
+ bool: True if tool is hung on frame correctly
+ """
+ return self._check_frame_assembled() and self._check_tool_on_frame()
+
+ def _check_frame_assembled(self):
+ """
+ Check if the frame has been assembled correctly. This checks the following things:
+ (1) the base is upright
+ (2) the end of the hook frame is close enough to the base
+ (3) the hook frame is between the walls of the base
+ """
+
+ # position of base
+ base_pos = self.sim.data.geom_xpos[self.obj_geom_id["stand_base"]]
+
+ # check (1): the base is upright. Just take the vector between two locations on the base shaft, and check
+ # that the angle to the z-axis is small, by computing the angle between that unit vector and
+ # the z-axis. Recall that for two unit vectors, the arccosine of the dot product gives the angle.
+ vec_along_base_shaft = self.sim.data.site_xpos[self.obj_site_id["stand_mount_site"]] - base_pos
+ vec_along_base_shaft = vec_along_base_shaft / np.linalg.norm(vec_along_base_shaft)
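+        # (the dot product with the z-axis reduces to the z-component of the unit shaft vector)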
+ angle_to_z_axis = np.abs(np.arccos(vec_along_base_shaft[2]))
+ base_shaft_is_vertical = angle_to_z_axis < np.pi / 18.0 # less than 10 degrees
+
+ # check (2): the end of the hook frame is close enough to the base. Just check the distance
+ if "frame_tip_site" in self.obj_site_id:
+ bottom_hook_pos = self.sim.data.site_xpos[self.obj_site_id["frame_tip_site"]]
+ else:
+ bottom_hook_pos = self.sim.data.site_xpos[self.obj_site_id["frame_mount_site"]]
+ insertion_dist = np.linalg.norm(bottom_hook_pos - base_pos)
+ # insertion_tolerance = (self.frame_args["frame_thickness"] / 2.)
+ insertion_tolerance = 0.05 # NOTE: this was manually tuned
+ bottom_is_close_enough = insertion_dist < insertion_tolerance
+
+ # check (3): the hook frame is in between the walls of the base. Take the geom positions of opposing base walls
+ # and check that they are on opposite sides of the line defined by the hook frame.
+
+ # normalized vector that points along the frame hook
+ hook_endpoint = self.sim.data.site_xpos[self.obj_site_id["frame_mount_site"]]
+ frame_hook_vec = self.sim.data.site_xpos[self.obj_site_id["frame_intersection_site"]] - hook_endpoint
+ frame_hook_length = np.linalg.norm(frame_hook_vec)
+ frame_hook_vec = frame_hook_vec / frame_hook_length
+
+ # geom wall position vectors relative to base position
+ geom_positions = [
+ self.sim.data.geom_xpos[self.obj_geom_id["stand_wall_{}".format(i)]] - hook_endpoint for i in range(4)
+ ]
+
+ # take cross product of each point against the line, and then dot the result to see if
+ # the sign is positive or negative. If it is positive, then they are on the same side
+ # (visualize with right-hand-rule to see this)
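+        # worked example: with frame_hook_vec = [1, 0, 0], wall offsets [0, 1, 0] and [0, -1, 0] yield
+        # cross products [0, 0, -1] and [0, 0, 1], whose dot product is -1 < 0, i.e. opposite sides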
+ rod_is_between_stand_walls = all(
+ [
+ np.dot(np.cross(geom_positions[0], frame_hook_vec), np.cross(geom_positions[2], frame_hook_vec)) < 0,
+ np.dot(np.cross(geom_positions[1], frame_hook_vec), np.cross(geom_positions[3], frame_hook_vec)) < 0,
+ ]
+ )
+
+ return base_shaft_is_vertical and (bottom_is_close_enough and rod_is_between_stand_walls)
+
+ def _check_tool_on_frame(self):
+ """
+ Check if the tool has been hung on the frame correctly. This checks the following things:
+ (1) the robot is not touching the tool (it is hanging on its own)
+ (2) the tool hole is making contact with the frame hook
+ (3) the tool hole is close to the line defined by the frame hook
+        (4) two opposite points on the tool hole are on opposite sides of the frame hook
+ (5) the tool hole is inserted far enough into the frame hook
+ """
+
+ # check (1): robot is not touching the tool
+ robot_grasp_geoms = [
+ self.robots[0].gripper.important_geoms["left_fingerpad"],
+ self.robots[0].gripper.important_geoms["right_fingerpad"],
+ ]
+ robot_and_tool_contact = False
+ for g_group in robot_grasp_geoms:
+ if check_contact(self.sim, g_group, self.tool.contact_geoms):
+ robot_and_tool_contact = True
+ break
+
+ # check (2): the tool hole is making contact with the frame hook
+ all_tool_hole_geoms = ["tool_hole1_hc_{}".format(i) for i in range(self.tool_args["ngeoms"])]
+ frame_hook_geom = "frame_horizontal_frame"
+ frame_and_tool_hole_contact = check_contact(self.sim, all_tool_hole_geoms, frame_hook_geom)
+
+ # check (3): compute distance from tool hole center to the line defined by the frame hook
+
+ # normalized vector that points along the frame hook
+ hook_endpoint = self.sim.data.site_xpos[self.obj_site_id["frame_hang_site"]]
+ frame_hook_vec = self.sim.data.site_xpos[self.obj_site_id["frame_intersection_site"]] - hook_endpoint
+ frame_hook_length = np.linalg.norm(frame_hook_vec)
+ frame_hook_vec = frame_hook_vec / frame_hook_length
+
+ # compute orthogonal projection of tool hole point to get distance to frame hook line
+ # (see https://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line#Vector_formulation)
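+        # (with unit vector u along the hook and v from the hook endpoint to the hole center,
+        # the distance is ||v - (v . u) u||, which is what the next few lines compute)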
+ tool_hole_center = self.sim.data.site_xpos[self.obj_site_id["tool_hole1_center"]]
+ tool_hole_vec = tool_hole_center - hook_endpoint
+ tool_hole_dot = np.dot(tool_hole_vec, frame_hook_vec)
+ tool_hole_proj = tool_hole_dot * frame_hook_vec
+ tool_hole_ortho_proj = tool_hole_vec - tool_hole_proj
+ dist_to_frame_hook_line = np.linalg.norm(tool_hole_ortho_proj)
+
+        # distance needs to be less than the difference between the inner tool hole radius and the half-thickness of the frame hook box geom
+ tool_hole_is_close_enough = dist_to_frame_hook_line < (
+ self.tool_args["inner_radius_1"] - (self.frame_args["frame_thickness"] / 2.0)
+ )
+
+ # check (4): take two opposite geoms around the tool hole, and check that they are on opposite sides of the frame hook line
+ # to guarantee that insertion has taken place
+ g2_id = self.tool_args["ngeoms"] // 2 # get geom opposite geom 0
+ g1_pos = self.sim.data.geom_xpos[self.obj_geom_id["tool_hole1_hc_0"]]
+ g2_pos = self.sim.data.geom_xpos[self.obj_geom_id["tool_hole1_hc_{}".format(g2_id)]]
+
+ # take cross product of each point against the line, and then dot the result to see if
+ # the sign is positive or negative. If it is positive, then they are on the same side
+ # (visualize with right-hand-rule to see this)
+ g1_vec = g1_pos - hook_endpoint
+ g2_vec = g2_pos - hook_endpoint
+ tool_is_between_hook = np.dot(np.cross(g1_vec, frame_hook_vec), np.cross(g2_vec, frame_hook_vec)) < 0
+
+ # check (5): check if tool insertion is far enough - check this by computing normalized distance of projection along frame hook line.
+ # We ensure that it's at least 5% inserted along the length of the frame hook.
+ normalized_dist_along_frame_hook_line = tool_hole_dot / frame_hook_length
+ tool_is_inserted_far_enough = (normalized_dist_along_frame_hook_line > 0.05) and (
+ normalized_dist_along_frame_hook_line < 1.0
+ )
+
+ return all(
+ [
+ (not robot_and_tool_contact),
+ frame_and_tool_hole_contact,
+ tool_hole_is_close_enough,
+ tool_is_between_hook,
+ tool_is_inserted_far_enough,
+ ]
+ )
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_env.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5c99f8d68deaf5c83eaed5fe464a11a56a3caac
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_env.py
@@ -0,0 +1,136 @@
+import numpy as np
+
+from robosuite.environments.manipulation.manipulation_env import ManipulationEnv
+from robosuite.utils.robot_utils import check_bimanual
+from robosuite.utils.transform_utils import mat2quat
+
+
+class TwoArmEnv(ManipulationEnv):
+ """
+ A manipulation environment intended for two robot arms.
+ """
+
+ def _check_robot_configuration(self, robots):
+ """
+        Sanity check to make sure the inputted robots and configuration are acceptable
+
+ Args:
+ robots (str or list of str): Robots to instantiate within this env
+ """
+ super()._check_robot_configuration(robots)
+ robots = robots if type(robots) == list or type(robots) == tuple else [robots]
+ # If default config is used, set env_configuration accordingly
+ if self.env_configuration == "default":
+ self.env_configuration = "bimanual" if check_bimanual(robots[0]) else "single-arm-opposed"
+
+ if self.env_configuration == "single-arm-opposed" or self.env_configuration == "single-arm-parallel":
+ # Specifically two robots should be inputted!
+ is_bimanual = False
+ if type(robots) is not list or len(robots) != 2:
+ raise ValueError(
+ "Error: Exactly two single-armed robots should be inputted " "for this task configuration!"
+ )
+ elif self.env_configuration == "bimanual":
+ is_bimanual = True
+ # Specifically one robot should be inputted!
+ if type(robots) is list and len(robots) != 1:
+ raise ValueError("Error: Exactly one bimanual robot should be inputted " "for this task configuration!")
+ else:
+ # This is an unknown env configuration, print error
+ raise ValueError(
+ "Error: Unknown environment configuration received. Only 'bimanual',"
+ "'single-arm-parallel', and 'single-arm-opposed' are supported. Got: {}".format(self.env_configuration)
+ )
+
+ # Lastly, check to make sure all inputted robot names are of their correct type (bimanual / not bimanual)
+ for robot in robots:
+ if check_bimanual(robot) != is_bimanual:
+ raise ValueError(
+ "Error: For {} configuration, expected bimanual check to return {}; "
+ "instead, got {}.".format(self.env_configuration, is_bimanual, check_bimanual(robot))
+ )
+
+ @property
+ def _eef0_xpos(self):
+ """
+ Grab the position of Robot 0's end effector.
+
+ Returns:
+ np.array: (x,y,z) position of EEF0
+ """
+ if self.env_configuration == "bimanual":
+ return np.array(self.sim.data.site_xpos[self.robots[0].eef_site_id["right"]])
+ else:
+ return np.array(self.sim.data.site_xpos[self.robots[0].eef_site_id])
+
+ @property
+ def _eef1_xpos(self):
+ """
+ Grab the position of Robot 1's end effector.
+
+ Returns:
+ np.array: (x,y,z) position of EEF1
+ """
+ if self.env_configuration == "bimanual":
+ return np.array(self.sim.data.site_xpos[self.robots[0].eef_site_id["left"]])
+ else:
+ return np.array(self.sim.data.site_xpos[self.robots[1].eef_site_id])
+
+ @property
+ def _eef0_xmat(self):
+ """
+ End Effector 0 orientation as a rotation matrix
+ Note that this draws the orientation from the "ee" site, NOT the gripper site, since the gripper
+ orientations are inconsistent!
+
+ Returns:
+ np.array: (3,3) orientation matrix for EEF0
+ """
+ pf = self.robots[0].gripper.naming_prefix
+
+ if self.env_configuration == "bimanual":
+ return np.array(self.sim.data.site_xmat[self.sim.model.site_name2id(pf + "right_grip_site")]).reshape(3, 3)
+
+ else:
+ return np.array(self.sim.data.site_xmat[self.sim.model.site_name2id(pf + "grip_site")]).reshape(3, 3)
+
+ @property
+ def _eef1_xmat(self):
+ """
+ End Effector 1 orientation as a rotation matrix
+ Note that this draws the orientation from the "ee" site, NOT the gripper site, since the gripper
+ orientations are inconsistent!
+
+ Returns:
+ np.array: (3,3) orientation matrix for EEF1
+ """
+ if self.env_configuration == "bimanual":
+ pf = self.robots[0].gripper.naming_prefix
+ return np.array(self.sim.data.site_xmat[self.sim.model.site_name2id(pf + "left_grip_site")]).reshape(3, 3)
+ else:
+ pf = self.robots[1].gripper.naming_prefix
+ return np.array(self.sim.data.site_xmat[self.sim.model.site_name2id(pf + "grip_site")]).reshape(3, 3)
+
+ @property
+ def _eef0_xquat(self):
+ """
+ End Effector 0 orientation as a (x,y,z,w) quaternion
+ Note that this draws the orientation from the "ee" site, NOT the gripper site, since the gripper
+ orientations are inconsistent!
+
+ Returns:
+ np.array: (x,y,z,w) quaternion for EEF0
+ """
+ return mat2quat(self._eef0_xmat)
+
+ @property
+ def _eef1_xquat(self):
+ """
+ End Effector 1 orientation as a (x,y,z,w) quaternion
+ Note that this draws the orientation from the "ee" site, NOT the gripper site, since the gripper
+ orientations are inconsistent!
+
+ Returns:
+ np.array: (x,y,z,w) quaternion for EEF1
+ """
+ return mat2quat(self._eef1_xmat)
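+
+# Illustrative usage sketch (comments only): concrete subclasses of TwoArmEnv are normally
+# created through robosuite's registry, after which the end-effector helpers above become
+# available, e.g.
+#
+#   import robosuite as suite
+#   env = suite.make("TwoArmHandover", robots=["Panda", "Panda"],
+#                    has_renderer=False, use_camera_obs=False)
+#   env.reset()
+#   print(env._eef0_xpos, env._eef1_xquat)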
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_handover.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_handover.py
new file mode 100644
index 0000000000000000000000000000000000000000..db0d5f94d0b75e59c85128bdcc4d5209d5c84547
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_handover.py
@@ -0,0 +1,617 @@
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.environments.manipulation.two_arm_env import TwoArmEnv
+from robosuite.models.arenas import TableArena
+from robosuite.models.objects import HammerObject
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import UniformRandomSampler
+
+
+class TwoArmHandover(TwoArmEnv):
+ """
+ This class corresponds to the handover task for two robot arms.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+            Note: Must be either 2 single-arm robots or 1 bimanual robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment. Can be either:
+
+ :`'bimanual'`: Only applicable for bimanual robot setups. Sets up the (single) bimanual robot on the -x
+ side of the table
+ :`'single-arm-parallel'`: Only applicable for multi single arm setups. Sets up the (two) single armed
+ robots next to each other on the -x side of the table
+ :`'single-arm-opposed'`: Only applicable for multi single arm setups. Sets up the (two) single armed
+ robots opposed from each others on the opposite +/-y sides of the table.
+
+ Note that "default" corresponds to either "bimanual" if a bimanual robot is used or "single-arm-opposed" if two
+ single-arm robots are used.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ prehensile (bool): If true, handover object starts on the table. Else, the object starts in Arm0's gripper
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+ use_object_obs (bool): if True, include object (cube) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ placement_initializer (ObjectPositionSampler): if provided, will
+ be used to place objects on every reset, else a UniformRandomSampler
+ is used by default.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+ robot's camera list).
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ ValueError: [Invalid number of robots specified]
+ ValueError: [Invalid env configuration]
+ ValueError: [Invalid robots for specified env configuration]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ prehensile=True,
+ table_full_size=(0.8, 1.2, 0.05),
+ table_friction=(1.0, 5e-3, 1e-4),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ placement_initializer=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # Task settings
+ self.prehensile = prehensile
+
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_true_size = list(table_full_size)
+        self.table_true_size[1] *= 0.25  # the actual table is only a quarter of the specified width
+ self.table_friction = table_friction
+ self.table_offset = [0, self.table_full_size[1] * (-3 / 8), 0.8]
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+        self.height_threshold = 0.1  # height above the table surface at which the hammer is considered lifted
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # object placement initializer
+ self.placement_initializer = placement_initializer
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 2.0 is provided when only Arm 1 is gripping the handle and has the handle
+ lifted above a certain threshold
+
+ Un-normalized max-wise components if using reward shaping:
+
+ - Arm0 Reaching: (1) in [0, 0.25] proportional to the distance between Arm 0 and the handle
+ - Arm0 Grasping: (2) in {0, 0.5}, nonzero if Arm 0 is gripping the hammer (any part).
+ - Arm0 Lifting: (3) in {0, 1.0}, nonzero if Arm 0 lifts the handle from the table past a certain threshold
+ - Arm0 Hovering: (4) in {0, [1.0, 1.25]}, nonzero only if Arm0 is actively lifting the hammer, and is
+ proportional to the distance between the handle and Arm 1
+ conditioned on the handle being lifted from the table and being grasped by Arm 0
+ - Mutual Grasping: (5) in {0, 1.5}, nonzero if both Arm 0 and Arm 1 are gripping the hammer (Arm 1 must be
+ gripping the handle) while lifted above the table
+ - Handover: (6) in {0, 2.0}, nonzero when only Arm 1 is gripping the handle and has the handle
+ lifted above the table
+
+ Note that the final reward is normalized and scaled by reward_scale / 2.0 as
+ well so that the max score is equal to reward_scale
+
+ Args:
+ action (np array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ # Initialize reward
+ reward = 0
+
+ # use a shaping reward if specified
+ if self.reward_shaping:
+ # Grab relevant parameters
+ arm0_grasp_any, arm1_grasp_handle, hammer_height, table_height = self._get_task_info()
+ # First, we'll consider the cases if the hammer is lifted above the threshold (step 3 - 6)
+ if hammer_height - table_height > self.height_threshold:
+ # Split cases depending on whether arm1 is currently grasping the handle or not
+ if arm1_grasp_handle:
+ # Check if arm0 is grasping
+ if arm0_grasp_any:
+ # This is step 5
+ reward = 1.5
+ else:
+ # This is step 6 (completed task!)
+ reward = 2.0
+ # This is the case where only arm0 is grasping (step 2-3)
+ else:
+ reward = 1.0
+ # Add in up to 0.25 based on distance between handle and arm1
+ dist = np.linalg.norm(self._gripper_1_to_handle)
+ reaching_reward = 0.25 * (1 - np.tanh(1.0 * dist))
+ reward += reaching_reward
+ # Else, the hammer is still on the ground ):
+ else:
+ # Split cases depending on whether arm0 is currently grasping the handle or not
+ if arm0_grasp_any:
+ # This is step 2
+ reward = 0.5
+ else:
+ # This is step 1, we want to encourage arm0 to reach for the handle
+ dist = np.linalg.norm(self._gripper_0_to_handle)
+ reaching_reward = 0.25 * (1 - np.tanh(1.0 * dist))
+ reward = reaching_reward
+
+ # Else this is the sparse reward setting
+ else:
+            # Provide reward if only Arm 1 is grasping the handle and it is lifted above the pre-defined threshold
+ if self._check_success():
+ reward = 2.0
+
+ if self.reward_scale is not None:
+ reward *= self.reward_scale / 2.0
+
+ return reward
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose(s) accordingly
+ if self.env_configuration == "bimanual":
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+ else:
+ if self.env_configuration == "single-arm-opposed":
+ # Set up robots facing towards each other by rotating them from their default position
+ for robot, rotation, offset in zip(self.robots, (np.pi / 2, -np.pi / 2), (-0.25, 0.25)):
+ xpos = robot.robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ rot = np.array((0, 0, rotation))
+ xpos = T.euler2mat(rot) @ np.array(xpos)
+ xpos += np.array((0, offset, 0))
+ robot.robot_model.set_base_xpos(xpos)
+ robot.robot_model.set_base_ori(rot)
+ else: # "single-arm-parallel" configuration setting
+ # Set up robots parallel to each other but offset from the center
+ for robot, offset in zip(self.robots, (-0.6, 0.6)):
+ xpos = robot.robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ xpos = np.array(xpos) + np.array((0, offset, 0))
+ robot.robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = TableArena(
+ table_full_size=self.table_true_size, table_friction=self.table_friction, table_offset=self.table_offset
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # Modify default agentview camera
+ mujoco_arena.set_camera(
+ camera_name="agentview",
+ pos=[0.8894354364730311, -3.481824231498976e-08, 1.7383813133506494],
+ quat=[0.6530981063842773, 0.2710406184196472, 0.27104079723358154, 0.6530979871749878],
+ )
+
+ # initialize objects of interest
+ self.hammer = HammerObject(name="hammer")
+
+ # Create placement initializer
+ if self.placement_initializer is not None:
+ self.placement_initializer.reset()
+ self.placement_initializer.add_objects(self.hammer)
+ else:
+ # Set rotation about y-axis if hammer starts on table else rotate about z if it starts in gripper
+ rotation_axis = "y" if self.prehensile else "z"
+ self.placement_initializer = UniformRandomSampler(
+ name="ObjectSampler",
+ mujoco_objects=self.hammer,
+ x_range=[-0.1, 0.1],
+ y_range=[-0.05, 0.05],
+ rotation=None,
+ rotation_axis=rotation_axis,
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=True,
+ reference_pos=self.table_offset,
+ )
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=self.hammer,
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Hammer object references from this env
+ self.hammer_body_id = self.sim.model.body_name2id(self.hammer.root_body)
+ self.hammer_handle_geom_id = self.sim.model.geom_name2id(self.hammer.handle_geoms[0])
+
+ # General env references
+ self.table_top_id = self.sim.model.site_name2id("table_top")
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ if self.env_configuration == "bimanual":
+ pf0 = self.robots[0].robot_model.naming_prefix + "right_"
+ pf1 = self.robots[0].robot_model.naming_prefix + "left_"
+ else:
+ pf0 = self.robots[0].robot_model.naming_prefix
+ pf1 = self.robots[1].robot_model.naming_prefix
+ modality = "object"
+
+ # position and rotation of hammer
+ @sensor(modality=modality)
+ def hammer_pos(obs_cache):
+ return np.array(self._hammer_pos)
+
+ @sensor(modality=modality)
+ def hammer_quat(obs_cache):
+ return np.array(self._hammer_quat)
+
+ @sensor(modality=modality)
+ def handle_xpos(obs_cache):
+ return np.array(self._handle_xpos)
+
+ @sensor(modality=modality)
+ def gripper0_to_handle(obs_cache):
+ return (
+ obs_cache["handle_xpos"] - obs_cache[f"{pf0}eef_pos"]
+ if "handle_xpos" in obs_cache and f"{pf0}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def gripper1_to_handle(obs_cache):
+ return (
+ obs_cache["handle_xpos"] - obs_cache[f"{pf1}eef_pos"]
+ if "handle_xpos" in obs_cache and f"{pf1}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ sensors = [hammer_pos, hammer_quat, handle_xpos, gripper0_to_handle, gripper1_to_handle]
+ names = [s.__name__ for s in sensors]
+
+ # Create observables
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # Loop through all objects and reset their positions
+ for obj_pos, obj_quat, obj in object_placements.values():
+ # If prehensile, set the object normally
+ if self.prehensile:
+ self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
+ # Else, set the object in the hand of the robot and loop a few steps to guarantee the robot is grasping
+ # the object initially
+ else:
+ eef_rot_quat = T.mat2quat(T.euler2mat([np.pi - T.mat2euler(self._eef0_xmat)[2], 0, 0]))
+ obj_quat = T.quat_multiply(obj_quat, eef_rot_quat)
+ for j in range(100):
+ # Set object in hand
+ self.sim.data.set_joint_qpos(
+ obj.joints[0], np.concatenate([self._eef0_xpos, np.array(obj_quat)])
+ )
+ # Close gripper (action = 1) and prevent arm from moving
+ if self.env_configuration == "bimanual":
+ # Execute no-op action with gravity compensation
+ torques = np.concatenate(
+ [
+ self.robots[0].controller["right"].torque_compensation,
+ self.robots[0].controller["left"].torque_compensation,
+ ]
+ )
+ self.sim.data.ctrl[self.robots[0]._ref_joint_actuator_indexes] = torques
+ # Execute gripper action
+ self.robots[0].grip_action(gripper=self.robots[0].gripper["right"], gripper_action=[1])
+ else:
+ # Execute no-op action with gravity compensation
+ self.sim.data.ctrl[self.robots[0]._ref_joint_actuator_indexes] = self.robots[
+ 0
+ ].controller.torque_compensation
+ self.sim.data.ctrl[self.robots[1]._ref_joint_actuator_indexes] = self.robots[
+ 1
+ ].controller.torque_compensation
+ # Execute gripper action
+ self.robots[0].grip_action(gripper=self.robots[0].gripper, gripper_action=[1])
+ # Take forward step
+ self.sim.step()
+
+ def _get_task_info(self):
+ """
+ Helper function that grabs the current relevant locations of objects of interest within the environment
+
+ Returns:
+ 4-tuple:
+
+ - (bool) True if Arm0 is grasping any part of the hammer
+ - (bool) True if Arm1 is grasping the hammer handle
+ - (float) Height of the hammer body
+ - (float) Height of the table surface
+ """
+ # Get height of hammer and table and define height threshold
+ hammer_angle_offset = (self.hammer.handle_length / 2 + 2 * self.hammer.head_halfsize) * np.sin(
+ self._hammer_angle
+ )
+ hammer_height = (
+ self.sim.data.geom_xpos[self.hammer_handle_geom_id][2] - self.hammer.top_offset[2] - hammer_angle_offset
+ )
+ table_height = self.sim.data.site_xpos[self.table_top_id][2]
+
+ # Check if any Arm's gripper is grasping the hammer handle
+ (g0, g1) = (
+ (self.robots[0].gripper["right"], self.robots[0].gripper["left"])
+ if self.env_configuration == "bimanual"
+ else (self.robots[0].gripper, self.robots[1].gripper)
+ )
+ arm0_grasp_any = self._check_grasp(gripper=g0, object_geoms=self.hammer)
+ arm1_grasp_handle = self._check_grasp(gripper=g1, object_geoms=self.hammer.handle_geoms)
+
+ # Return all relevant values
+ return arm0_grasp_any, arm1_grasp_handle, hammer_height, table_height
+
+ def _check_success(self):
+ """
+ Check if hammer is successfully handed off
+
+ Returns:
+ bool: True if handover has been completed
+ """
+ # Grab relevant params
+ arm0_grasp_any, arm1_grasp_handle, hammer_height, table_height = self._get_task_info()
+        return arm1_grasp_handle and not arm0_grasp_any and hammer_height - table_height > self.height_threshold
+
+ @property
+ def _handle_xpos(self):
+ """
+ Grab the position of the hammer handle.
+
+ Returns:
+ np.array: (x,y,z) position of handle
+ """
+ return self.sim.data.geom_xpos[self.hammer_handle_geom_id]
+
+ @property
+ def _hammer_pos(self):
+ """
+ Grab the position of the hammer body.
+
+ Returns:
+ np.array: (x,y,z) position of body
+ """
+ return np.array(self.sim.data.body_xpos[self.hammer_body_id])
+
+ @property
+ def _hammer_quat(self):
+ """
+ Grab the orientation of the hammer body.
+
+ Returns:
+ np.array: (x,y,z,w) quaternion of the hammer body
+ """
+ return T.convert_quat(self.sim.data.body_xquat[self.hammer_body_id], to="xyzw")
+
+ @property
+ def _hammer_angle(self):
+ """
+ Calculate the angle of hammer with the ground, relative to it resting horizontally
+
+ Returns:
+ float: angle in radians
+ """
+ mat = T.quat2mat(self._hammer_quat)
+ z_unit = [0, 0, 1]
+ z_rotated = np.matmul(mat, z_unit)
+ return np.pi / 2 - np.arccos(np.dot(z_unit, z_rotated))
+
+ @property
+ def _gripper_0_to_handle(self):
+ """
+        Calculate vector from Arm 0's gripper to the hammer handle.
+
+ Returns:
+ np.array: (dx,dy,dz) distance vector between handle and EEF0
+ """
+ return self._handle_xpos - self._eef0_xpos
+
+ @property
+ def _gripper_1_to_handle(self):
+ """
+        Calculate vector from Arm 1's gripper to the hammer handle.
+
+ Returns:
+ np.array: (dx,dy,dz) distance vector between handle and EEF1
+ """
+ return self._handle_xpos - self._eef1_xpos
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_lift.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_lift.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba3acf1599aaab948e7e10a201d2a24e6082189d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_lift.py
@@ -0,0 +1,545 @@
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.environments.manipulation.two_arm_env import TwoArmEnv
+from robosuite.models.arenas import TableArena
+from robosuite.models.objects import PotWithHandlesObject
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import UniformRandomSampler
+
+
+class TwoArmLift(TwoArmEnv):
+ """
+ This class corresponds to the lifting task for two robot arms.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+            Note: Must be either 2 single-arm robots or 1 bimanual robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment. Can be either:
+
+ :`'bimanual'`: Only applicable for bimanual robot setups. Sets up the (single) bimanual robot on the -x
+ side of the table
+ :`'single-arm-parallel'`: Only applicable for multi single arm setups. Sets up the (two) single armed
+ robots next to each other on the -x side of the table
+ :`'single-arm-opposed'`: Only applicable for multi single arm setups. Sets up the (two) single armed
+ robots opposed from each others on the opposite +/-y sides of the table.
+
+ Note that "default" corresponds to either "bimanual" if a bimanual robot is used or "single-arm-opposed" if two
+ single-arm robots are used.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ table_full_size (3-tuple): x, y, and z dimensions of the table.
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ the table.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+ use_object_obs (bool): if True, include object (cube) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ placement_initializer (ObjectPositionSampler): if provided, will
+ be used to place objects on every reset, else a UniformRandomSampler
+ is used by default.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+ robot's camera list).
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ ValueError: [Invalid number of robots specified]
+ ValueError: [Invalid env configuration]
+ ValueError: [Invalid robots for specified env configuration]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1.0, 5e-3, 1e-4),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ placement_initializer=None,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # settings for table top
+ self.table_full_size = table_full_size
+ self.table_friction = table_friction
+ self.table_offset = np.array((0, 0, 0.8))
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # object placement initializer
+ self.placement_initializer = placement_initializer
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 3.0 is provided if the pot is lifted and is parallel within 30 deg to the table
+
+ Un-normalized summed components if using reward shaping:
+
+        - Reaching: in [0, 0.5], per-arm component that is proportional to the distance between each arm and its
+          respective pot handle, and exactly 0.5 when grasping the handle
+        - Grasping: in {0, 0.25}, binary per-arm component awarded if the gripper is grasping its correct handle
+        - Lifting: in [0, 1.5], proportional to the pot's height above the table, and capped at a certain threshold
+          - Note that the agent only gets the lifting reward when flipping no more than 30 degrees.
+
+ Note that the final reward is normalized and scaled by reward_scale / 3.0 as
+ well so that the max score is equal to reward_scale
+
+ Args:
+ action (np array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ reward = 0
+
+ # check if the pot is tilted more than 30 degrees
+ mat = T.quat2mat(self._pot_quat)
+ z_unit = [0, 0, 1]
+ z_rotated = np.matmul(mat, z_unit)
+ cos_z = np.dot(z_unit, z_rotated)
+ cos_30 = np.cos(np.pi / 6)
+ direction_coef = 1 if cos_z >= cos_30 else 0
+
+        # check for goal completion: pot is higher than the table top above a margin
+ if self._check_success():
+ reward = 3.0 * direction_coef
+
+ # use a shaping reward
+ elif self.reward_shaping:
+ # lifting reward
+ pot_bottom_height = self.sim.data.site_xpos[self.pot_center_id][2] - self.pot.top_offset[2]
+ table_height = self.sim.data.site_xpos[self.table_top_id][2]
+ elevation = pot_bottom_height - table_height
+ r_lift = min(max(elevation - 0.05, 0), 0.15)
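+            # r_lift is capped at 0.15, so the 10x multiplier below keeps this term within the
+            # [0, 1.5] lifting range quoted in the docstring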
+ reward += 10.0 * direction_coef * r_lift
+
+ _gripper0_to_handle0 = self._gripper0_to_handle0
+ _gripper1_to_handle1 = self._gripper1_to_handle1
+
+ # gh stands for gripper-handle
+ # When grippers are far away, tell them to be closer
+
+ # Get contacts
+ (g0, g1) = (
+ (self.robots[0].gripper["right"], self.robots[0].gripper["left"])
+ if self.env_configuration == "bimanual"
+ else (self.robots[0].gripper, self.robots[1].gripper)
+ )
+
+ _g0h_dist = np.linalg.norm(_gripper0_to_handle0)
+ _g1h_dist = np.linalg.norm(_gripper1_to_handle1)
+
+ # Grasping reward
+ if self._check_grasp(gripper=g0, object_geoms=self.pot.handle0_geoms):
+ reward += 0.25
+ # Reaching reward
+ reward += 0.5 * (1 - np.tanh(10.0 * _g0h_dist))
+
+ # Grasping reward
+ if self._check_grasp(gripper=g1, object_geoms=self.pot.handle1_geoms):
+ reward += 0.25
+ # Reaching reward
+ reward += 0.5 * (1 - np.tanh(10.0 * _g1h_dist))
+
+ if self.reward_scale is not None:
+ reward *= self.reward_scale / 3.0
+
+ return reward
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose(s) accordingly
+ if self.env_configuration == "bimanual":
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+ else:
+ if self.env_configuration == "single-arm-opposed":
+ # Set up robots facing towards each other by rotating them from their default position
+ for robot, rotation in zip(self.robots, (np.pi / 2, -np.pi / 2)):
+ xpos = robot.robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ rot = np.array((0, 0, rotation))
+ xpos = T.euler2mat(rot) @ np.array(xpos)
+ robot.robot_model.set_base_xpos(xpos)
+ robot.robot_model.set_base_ori(rot)
+ else: # "single-arm-parallel" configuration setting
+ # Set up robots parallel to each other but offset from the center
+ for robot, offset in zip(self.robots, (-0.25, 0.25)):
+ xpos = robot.robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ xpos = np.array(xpos) + np.array((0, offset, 0))
+ robot.robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = TableArena(
+ table_full_size=self.table_full_size,
+ table_friction=self.table_friction,
+ table_offset=self.table_offset,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # initialize objects of interest
+ self.pot = PotWithHandlesObject(name="pot")
+
+ # Create placement initializer
+ if self.placement_initializer is not None:
+ self.placement_initializer.reset()
+ self.placement_initializer.add_objects(self.pot)
+ else:
+ self.placement_initializer = UniformRandomSampler(
+ name="ObjectSampler",
+ mujoco_objects=self.pot,
+ x_range=[-0.03, 0.03],
+ y_range=[-0.03, 0.03],
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=True,
+ reference_pos=self.table_offset,
+ rotation=(np.pi + -np.pi / 3, np.pi + np.pi / 3),
+ )
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=self.pot,
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Additional object references from this env
+ self.pot_body_id = self.sim.model.body_name2id(self.pot.root_body)
+ self.handle0_site_id = self.sim.model.site_name2id(self.pot.important_sites["handle0"])
+ self.handle1_site_id = self.sim.model.site_name2id(self.pot.important_sites["handle1"])
+ self.table_top_id = self.sim.model.site_name2id("table_top")
+ self.pot_center_id = self.sim.model.site_name2id(self.pot.important_sites["center"])
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ if self.env_configuration == "bimanual":
+ pf0 = self.robots[0].robot_model.naming_prefix + "right_"
+ pf1 = self.robots[0].robot_model.naming_prefix + "left_"
+ else:
+ pf0 = self.robots[0].robot_model.naming_prefix
+ pf1 = self.robots[1].robot_model.naming_prefix
+ modality = "object"
+
+ # position and rotation of object
+
+ @sensor(modality=modality)
+ def pot_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.pot_body_id])
+
+ @sensor(modality=modality)
+ def pot_quat(obs_cache):
+ return T.convert_quat(self.sim.data.body_xquat[self.pot_body_id], to="xyzw")
+
+ @sensor(modality=modality)
+ def handle0_xpos(obs_cache):
+ return np.array(self._handle0_xpos)
+
+ @sensor(modality=modality)
+ def handle1_xpos(obs_cache):
+ return np.array(self._handle1_xpos)
+
+ @sensor(modality=modality)
+ def gripper0_to_handle0(obs_cache):
+ return (
+ obs_cache["handle0_xpos"] - obs_cache[f"{pf0}eef_pos"]
+ if "handle0_xpos" in obs_cache and f"{pf0}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def gripper1_to_handle1(obs_cache):
+ return (
+ obs_cache["handle1_xpos"] - obs_cache[f"{pf1}eef_pos"]
+ if "handle1_xpos" in obs_cache and f"{pf1}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
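+            # Note: the gripper{i}_to_handle{i} sensors above read "handle{i}_xpos" and the
+            # corresponding eef positions from the observation cache, so they fall back to
+            # zeros until those upstream observables have been evaluated for the current step.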
+ sensors = [pot_pos, pot_quat, handle0_xpos, handle1_xpos, gripper0_to_handle0, gripper1_to_handle1]
+ names = [s.__name__ for s in sensors]
+
+ # Create observables
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # Loop through all objects and reset their positions
+ for obj_pos, obj_quat, obj in object_placements.values():
+ self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualize gripper site proportional to the distance to each handle.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "grippers" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+
+ # Color the gripper visualization site according to its distance to each handle
+ if vis_settings["grippers"]:
+ handles = [self.pot.important_sites[f"handle{i}"] for i in range(2)]
+ grippers = (
+ [self.robots[0].gripper[arm] for arm in self.robots[0].arms]
+ if self.env_configuration == "bimanual"
+ else [robot.gripper for robot in self.robots]
+ )
+ for gripper, handle in zip(grippers, handles):
+ self._visualize_gripper_to_target(gripper=gripper, target=handle, target_type="site")
+
+ def _check_success(self):
+ """
+ Check if pot is successfully lifted
+
+ Returns:
+ bool: True if pot is lifted
+ """
+ pot_bottom_height = self.sim.data.site_xpos[self.pot_center_id][2] - self.pot.top_offset[2]
+ table_height = self.sim.data.site_xpos[self.table_top_id][2]
+
+        # pot bottom is higher than the table top by a margin
+ return pot_bottom_height > table_height + 0.10
+
+ @property
+ def _handle0_xpos(self):
+ """
+        Grab the position of the left (blue) pot handle.
+
+ Returns:
+ np.array: (x,y,z) position of handle
+ """
+ return self.sim.data.site_xpos[self.handle0_site_id]
+
+ @property
+ def _handle1_xpos(self):
+ """
+        Grab the position of the right (green) pot handle.
+
+ Returns:
+ np.array: (x,y,z) position of handle
+ """
+ return self.sim.data.site_xpos[self.handle1_site_id]
+
+ @property
+ def _pot_quat(self):
+ """
+ Grab the orientation of the pot body.
+
+ Returns:
+ np.array: (x,y,z,w) quaternion of the pot body
+ """
+ return T.convert_quat(self.sim.data.body_xquat[self.pot_body_id], to="xyzw")
+
+ @property
+ def _gripper0_to_handle0(self):
+ """
+ Calculate vector from the left gripper to the left pot handle.
+
+ Returns:
+ np.array: (dx,dy,dz) distance vector between handle and EEF0
+ """
+ return self._handle0_xpos - self._eef0_xpos
+
+ @property
+ def _gripper1_to_handle1(self):
+ """
+ Calculate vector from the right gripper to the right pot handle.
+
+ Returns:
+            np.array: (dx,dy,dz) distance vector between handle and EEF1
+ """
+ return self._handle1_xpos - self._eef1_xpos
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_peg_in_hole.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_peg_in_hole.py
new file mode 100644
index 0000000000000000000000000000000000000000..02f46b2fc814b538f5627087508a6656de8144ce
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_peg_in_hole.py
@@ -0,0 +1,518 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.environments.manipulation.two_arm_env import TwoArmEnv
+from robosuite.models.arenas import EmptyArena
+from robosuite.models.objects import CylinderObject, PlateWithHoleObject
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.mjcf_utils import CustomMaterial, array_to_string, find_elements
+from robosuite.utils.observables import Observable, sensor
+
+
+class TwoArmPegInHole(TwoArmEnv):
+ """
+ This class corresponds to the peg-in-hole task for two robot arms.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+            Note: Must be either 2 single-arm robots or 1 bimanual robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment. Can be either:
+
+ :`'bimanual'`: Only applicable for bimanual robot setups. Sets up the (single) bimanual robot on the -x
+ side of the table
+ :`'single-arm-parallel'`: Only applicable for multi single arm setups. Sets up the (two) single armed
+ robots next to each other on the -x side of the table
+ :`'single-arm-opposed'`: Only applicable for multi single arm setups. Sets up the (two) single armed
+                robots opposed from each other on the opposite +/-y sides of the table.
+
+ Note that "default" corresponds to either "bimanual" if a bimanual robot is used or "single-arm-opposed" if two
+ single-arm robots are used.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate gripper models from gripper factory.
+ For this environment, setting a value other than the default (None) will raise an AssertionError, as
+ this environment is not meant to be used with any gripper at all.
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ use_camera_obs (bool or list of bool): if True, every observation for a specific robot includes a rendered
+ image. Should either be single bool if camera obs value is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ use_object_obs (bool): if True, include object (cube) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ peg_radius (2-tuple): low and high limits of the (uniformly sampled)
+ radius of the peg
+
+ peg_length (float): length of the peg
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+ robot's camera list).
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ AssertionError: [Gripper specified]
+ ValueError: [Invalid number of robots specified]
+ ValueError: [Invalid env configuration]
+ ValueError: [Invalid robots for specified env configuration]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types=None,
+ initialization_noise="default",
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ peg_radius=(0.015, 0.03),
+ peg_length=0.13,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # Assert that the gripper type is None
+ assert gripper_types is None, "Tried to specify gripper other than None in TwoArmPegInHole environment!"
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ # Save peg specs
+ self.peg_radius = peg_radius
+ self.peg_length = peg_length
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 5.0 is provided if the peg is inside the plate's hole
+ - Note that we enforce that it's inside at an appropriate angle (cos(theta) > 0.95).
+
+ Un-normalized summed components if using reward shaping:
+
+ - Reaching: in [0, 1], to encourage the arms to approach each other
+        - Perpendicular Distance: in [0, 1], to encourage the peg axis to pass close to the hole center
+        - Parallel Distance: in [0, 1], to encourage the peg to be near the hole center along the peg axis
+ - Alignment: in [0, 1], to encourage having the right orientation between the peg and hole.
+ - Placement: in {0, 1}, nonzero if the peg is in the hole with a relatively correct alignment
+
+ Note that the final reward is normalized and scaled by reward_scale / 5.0 as
+ well so that the max score is equal to reward_scale
+
+ """
+ reward = 0
+
+ # Right location and angle
+ if self._check_success():
+ reward = 1.0
+
+ # use a shaping reward
+ if self.reward_shaping:
+ # Grab relevant values
+ t, d, cos = self._compute_orientation()
+ # reaching reward
+ hole_pos = self.sim.data.body_xpos[self.hole_body_id]
+ gripper_site_pos = self.sim.data.body_xpos[self.peg_body_id]
+ dist = np.linalg.norm(gripper_site_pos - hole_pos)
+ reaching_reward = 1 - np.tanh(1.0 * dist)
+ reward += reaching_reward
+
+ # Orientation reward
+ reward += 1 - np.tanh(d)
+ reward += 1 - np.tanh(np.abs(t))
+ reward += cos
+
+ # if we're not reward shaping, scale sparse reward so that the max reward is identical to its dense version
+ else:
+ reward *= 5.0
+
+ if self.reward_scale is not None:
+ reward *= self.reward_scale / 5.0
+
+ return reward
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose(s) accordingly
+ if self.env_configuration == "bimanual":
+ xpos = self.robots[0].robot_model.base_xpos_offset["empty"]
+ self.robots[0].robot_model.set_base_xpos(xpos)
+ else:
+ if self.env_configuration == "single-arm-opposed":
+ # Set up robots facing towards each other by rotating them from their default position
+ for robot, rotation in zip(self.robots, (np.pi / 2, -np.pi / 2)):
+ xpos = robot.robot_model.base_xpos_offset["empty"]
+ rot = np.array((0, 0, rotation))
+ xpos = T.euler2mat(rot) @ np.array(xpos)
+ robot.robot_model.set_base_xpos(xpos)
+ robot.robot_model.set_base_ori(rot)
+ else: # "single-arm-parallel" configuration setting
+ # Set up robots parallel to each other but offset from the center
+ for robot, offset in zip(self.robots, (-0.25, 0.25)):
+ xpos = robot.robot_model.base_xpos_offset["empty"]
+ xpos = np.array(xpos) + np.array((0, offset, 0))
+ robot.robot_model.set_base_xpos(xpos)
+
+ # Add arena and robot
+ mujoco_arena = EmptyArena()
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # Modify default agentview camera
+ mujoco_arena.set_camera(
+ camera_name="agentview",
+ pos=[1.0666432116509934, 1.4903257668114777e-08, 2.0563394967349096],
+ quat=[0.6530979871749878, 0.27104058861732483, 0.27104055881500244, 0.6530978679656982],
+ )
+
+ # initialize objects of interest
+ self.hole = PlateWithHoleObject(name="hole")
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "1 1",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ greenwood = CustomMaterial(
+ texture="WoodGreen",
+ tex_name="greenwood",
+ mat_name="greenwood_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.peg = CylinderObject(
+ name="peg",
+ size_min=(self.peg_radius[0], self.peg_length),
+ size_max=(self.peg_radius[1], self.peg_length),
+ material=greenwood,
+ rgba=[0, 1, 0, 1],
+ joints=None,
+ )
+
+ # Load hole object
+ hole_obj = self.hole.get_obj()
+ hole_obj.set("quat", "0 0 0.707 0.707")
+ hole_obj.set("pos", "0.11 0 0.17")
+
+ # Load peg object
+ peg_obj = self.peg.get_obj()
+ peg_obj.set("pos", array_to_string((0, 0, self.peg_length)))
+
+ # Append appropriate objects to arms
+ if self.env_configuration == "bimanual":
+ r_eef, l_eef = [self.robots[0].robot_model.eef_name[arm] for arm in self.robots[0].arms]
+ r_model, l_model = [self.robots[0].robot_model, self.robots[0].robot_model]
+ else:
+ r_eef, l_eef = [robot.robot_model.eef_name for robot in self.robots]
+ r_model, l_model = [self.robots[0].robot_model, self.robots[1].robot_model]
+ r_body = find_elements(root=r_model.worldbody, tags="body", attribs={"name": r_eef}, return_first=True)
+ l_body = find_elements(root=l_model.worldbody, tags="body", attribs={"name": l_eef}, return_first=True)
+ r_body.append(peg_obj)
+ l_body.append(hole_obj)
+
+ # task includes arena, robot, and objects of interest
+ # We don't add peg and hole directly since they were already appended to the robots
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ )
+
+ # Make sure to add relevant assets from peg and hole objects
+ self.model.merge_assets(self.hole)
+ self.model.merge_assets(self.peg)
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Additional object references from this env
+ self.hole_body_id = self.sim.model.body_name2id(self.hole.root_body)
+ self.peg_body_id = self.sim.model.body_name2id(self.peg.root_body)
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ if self.env_configuration == "bimanual":
+ pf0 = self.robots[0].robot_model.naming_prefix + "right_"
+ pf1 = self.robots[0].robot_model.naming_prefix + "left_"
+ else:
+ pf0 = self.robots[0].robot_model.naming_prefix
+ pf1 = self.robots[1].robot_model.naming_prefix
+ modality = "object"
+
+ # position and rotation of peg and hole
+ @sensor(modality=modality)
+ def hole_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.hole_body_id])
+
+ @sensor(modality=modality)
+ def hole_quat(obs_cache):
+ return T.convert_quat(self.sim.data.body_xquat[self.hole_body_id], to="xyzw")
+
+ @sensor(modality=modality)
+ def peg_to_hole(obs_cache):
+ return (
+ obs_cache["hole_pos"] - np.array(self.sim.data.body_xpos[self.peg_body_id])
+ if "hole_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def peg_quat(obs_cache):
+ return T.convert_quat(self.sim.data.body_xquat[self.peg_body_id], to="xyzw")
+
+ # Relative orientation parameters
+ @sensor(modality=modality)
+ def angle(obs_cache):
+ t, d, cos = self._compute_orientation()
+ obs_cache["t"] = t
+ obs_cache["d"] = d
+ return cos
+
+ @sensor(modality=modality)
+ def t(obs_cache):
+ return obs_cache["t"] if "t" in obs_cache else 0.0
+
+ @sensor(modality=modality)
+ def d(obs_cache):
+ return obs_cache["d"] if "d" in obs_cache else 0.0
+
+ sensors = [hole_pos, hole_quat, peg_to_hole, peg_quat, angle, t, d]
+ names = [s.__name__ for s in sensors]
+
+ # Create observables
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ def _check_success(self):
+ """
+ Check if peg is successfully aligned and placed within the hole
+
+ Returns:
+ bool: True if peg is placed in hole correctly
+ """
+ t, d, cos = self._compute_orientation()
+
+ return d < 0.06 and -0.12 <= t <= 0.14 and cos > 0.95
+
+ def _compute_orientation(self):
+ """
+ Helper function to return the relative positions between the hole and the peg.
+ In particular, the intersection of the line defined by the peg and the plane
+ defined by the hole is computed; the parallel distance, perpendicular distance,
+ and angle are returned.
+
+ Returns:
+ 3-tuple:
+
+ - (float): parallel distance
+ - (float): perpendicular distance
+ - (float): angle
+ """
+ peg_mat = self.sim.data.body_xmat[self.peg_body_id]
+ peg_mat.shape = (3, 3)
+ peg_pos = self.sim.data.body_xpos[self.peg_body_id]
+
+ hole_pos = self.sim.data.body_xpos[self.hole_body_id]
+ hole_mat = self.sim.data.body_xmat[self.hole_body_id]
+ hole_mat.shape = (3, 3)
+
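+        # v is the peg's long (z) axis expressed in the world frame; the hole center is taken
+        # as the plate origin offset by 0.1 m along the plate's local x axis. t projects the
+        # vector from the peg to that center onto v (parallel distance), d is the perpendicular
+        # distance from the hole center to the peg axis, and the returned value is the absolute
+        # cosine of the angle between the peg axis and the hole normal.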
+ v = peg_mat @ np.array([0, 0, 1])
+ v = v / np.linalg.norm(v)
+ center = hole_pos + hole_mat @ np.array([0.1, 0, 0])
+
+ t = (center - peg_pos) @ v / (np.linalg.norm(v) ** 2)
+ d = np.linalg.norm(np.cross(v, peg_pos - center)) / np.linalg.norm(v)
+
+ hole_normal = hole_mat @ np.array([0, 0, 1])
+ return (
+ t,
+ d,
+ abs(np.dot(hole_normal, v) / np.linalg.norm(hole_normal) / np.linalg.norm(v)),
+ )
+
+ def _peg_pose_in_hole_frame(self):
+ """
+        A helper function that returns the pose of the peg expressed in the hole frame.
+
+ Returns:
+ np.array: (4,4) matrix corresponding to the pose of the peg in the hole frame
+ """
+ # World frame
+ peg_pos_in_world = self.sim.data.get_body_xpos(self.peg.root_body)
+ peg_rot_in_world = self.sim.data.get_body_xmat(self.peg.root_body).reshape((3, 3))
+ peg_pose_in_world = T.make_pose(peg_pos_in_world, peg_rot_in_world)
+
+ # World frame
+ hole_pos_in_world = self.sim.data.get_body_xpos(self.hole.root_body)
+ hole_rot_in_world = self.sim.data.get_body_xmat(self.hole.root_body).reshape((3, 3))
+ hole_pose_in_world = T.make_pose(hole_pos_in_world, hole_rot_in_world)
+
+ world_pose_in_hole = T.pose_inv(hole_pose_in_world)
+
+ peg_pose_in_hole = T.pose_in_A_to_pose_in_B(peg_pose_in_world, world_pose_in_hole)
+ return peg_pose_in_hole
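+
+
+# A minimal usage sketch, assuming robosuite and a working MuJoCo backend are installed;
+# the robot names and keyword values below are example choices rather than requirements
+# of the environment (any two single-arm robots, or one bimanual robot, may be used).
+if __name__ == "__main__":
+    import robosuite as suite
+
+    env = suite.make(
+        "TwoArmPegInHole",
+        robots=["Panda", "Panda"],
+        has_renderer=False,
+        has_offscreen_renderer=False,
+        use_camera_obs=False,
+    )
+    obs = env.reset()
+    low, high = env.action_spec
+    # Take one random action and report the (shaped or sparse) reward
+    obs, reward, done, info = env.step(np.random.uniform(low, high))
+    print("reward after one random step:", reward)
+    env.close()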
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_transport.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_transport.py
new file mode 100644
index 0000000000000000000000000000000000000000..d989a30075944c4c815aca72450443a08dead386
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/two_arm_transport.py
@@ -0,0 +1,602 @@
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.environments.manipulation.two_arm_env import TwoArmEnv
+from robosuite.models.arenas import MultiTableArena
+from robosuite.models.objects import BoxObject, HammerObject, TransportGroup
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.mjcf_utils import CustomMaterial
+from robosuite.utils.observables import Observable, sensor
+from robosuite.utils.placement_samplers import SequentialCompositeSampler, UniformRandomSampler
+
+
+class TwoArmTransport(TwoArmEnv):
+ """
+ This class corresponds to the transport task for two robot arms, requiring a payload to be transported from an
+ initial bin into a target bin, while removing trash from the target bin to a trash bin.
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+            Note: Must be either 2 single-arm robots or 1 bimanual robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment. Can be either:
+
+ :`'bimanual'`: Only applicable for bimanual robot setups. Sets up the (single) bimanual robot on the -x
+ side of the table
+ :`'single-arm-parallel'`: Only applicable for multi single arm setups. Sets up the (two) single armed
+ robots next to each other on the -x side of the table
+ :`'single-arm-opposed'`: Only applicable for multi single arm setups. Sets up the (two) single armed
+                robots opposed from each other on the opposite +/-y sides of the table.
+
+ Note that "default" corresponds to either "bimanual" if a bimanual robot is used or "single-arm-opposed" if two
+ single-arm robots are used.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+            gripper models from gripper factory. Default is "default", which is the default gripper(s) associated
+            with the robot(s) in the 'robots' specification. None removes the gripper, and any other (valid) model
+ overrides the default gripper. Should either be single str if same gripper type is to be used for all
+ robots or else it should be a list of the same length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ tables_boundary (3-tuple): x, y, and z dimensions of the table bounds. Two tables will be created at the edges of
+ this boundary
+
+ table_friction (3-tuple): the three mujoco friction parameters for
+ each table.
+
+ bin_size (3-tuple): (x,y,z) dimensions of bins to use
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+ use_object_obs (bool): if True, include object (cube) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+ robot's camera list).
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ Raises:
+ ValueError: [Invalid number of robots specified]
+ ValueError: [Invalid env configuration]
+ ValueError: [Invalid robots for specified env configuration]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="default",
+ initialization_noise="default",
+ tables_boundary=(0.8, 1.2, 0.05),
+ table_friction=(1.0, 5e-3, 1e-4),
+ bin_size=(0.3, 0.3, 0.15),
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=False,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+ # settings for table top
+ self.tables_boundary = tables_boundary
+ self.table_full_size = np.array(tables_boundary)
+ self.table_full_size[1] *= 0.25 # each table size will only be a fraction of the full boundary
+ self.table_friction = table_friction
+ self.table_offsets = np.zeros((2, 3))
+ self.table_offsets[0, 1] = self.tables_boundary[1] * -3 / 8 # scale y offset
+ self.table_offsets[1, 1] = self.tables_boundary[1] * 3 / 8 # scale y offset
+ self.table_offsets[:, 2] = 0.8 # scale z offset
+ self.bin_size = np.array(bin_size)
+
+ # reward configuration
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+        self.height_threshold = 0.1  # height above the table surface at which the payload is considered lifted
+
+ # whether to use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of 1.0 is provided when the payload is in the target bin and the trash is in the trash
+ bin
+
+ Un-normalized max-wise components if using reward shaping:
+
+ # TODO!
+
+ Note that the final reward is normalized and scaled by reward_scale / 1.0 as
+ well so that the max score is equal to reward_scale
+
+ Args:
+ action (np array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
+ # Initialize reward
+ reward = 0
+
+ # use a shaping reward if specified
+ if self.reward_shaping:
+ # TODO! So we print a warning and force sparse rewards
+            print("\n\nWarning! No dense reward is currently implemented for this task. Forcing sparse rewards.\n\n")
+ self.reward_shaping = False
+
+ # Else this is the sparse reward setting
+ else:
+ # Provide reward if payload is in target bin and trash is in trash bin
+ if self._check_success():
+ reward = 1.0
+
+ if self.reward_scale is not None:
+ reward *= self.reward_scale / 1.0
+
+ return reward
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose(s) accordingly
+ if self.env_configuration == "bimanual":
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+ else:
+ if self.env_configuration == "single-arm-opposed":
+ # Set up robots facing towards each other by rotating them from their default position
+ for robot, rotation, offset in zip(self.robots, (np.pi / 2, -np.pi / 2), (-0.25, 0.25)):
+ xpos = robot.robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ rot = np.array((0, 0, rotation))
+ xpos = T.euler2mat(rot) @ np.array(xpos)
+ xpos += np.array((0, offset, 0))
+ robot.robot_model.set_base_xpos(xpos)
+ robot.robot_model.set_base_ori(rot)
+ else: # "single-arm-parallel" configuration setting
+ # Set up robots parallel to each other but offset from the center
+ for robot, offset in zip(self.robots, (-0.6, 0.6)):
+ xpos = robot.robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ xpos = np.array(xpos) + np.array((0, offset, 0))
+ robot.robot_model.set_base_xpos(xpos)
+
+ # load model for table top workspace
+ mujoco_arena = MultiTableArena(
+ table_offsets=self.table_offsets,
+ table_rots=0,
+ table_full_sizes=self.table_full_size,
+ table_frictions=self.table_friction,
+ has_legs=True,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # Modify default agentview camera
+ mujoco_arena.set_camera(
+ camera_name="agentview",
+ pos=[0.8894354364730311, -3.481824231498976e-08, 1.7383813133506494],
+ quat=[0.6530981063842773, 0.2710406184196472, 0.27104079723358154, 0.6530979871749878],
+ )
+
+ # TODO: Add built-in method into TwoArmEnv so we have an elegant way of automatically adding extra cameras to all these envs
+ # Add shoulder cameras
+ mujoco_arena.set_camera(
+ camera_name="shouldercamera0",
+ pos=[0.4430096057365183, -1.0697399743660143, 1.3639950119362048],
+ quat=[0.804057240486145, 0.5531665086746216, 0.11286306381225586, 0.18644218146800995],
+ )
+ mujoco_arena.set_camera(
+ camera_name="shouldercamera1",
+ pos=[-0.40900713993039983, 0.9613722572245062, 1.3084072951772754],
+ quat=[0.15484197437763214, 0.12077208608388901, -0.5476858019828796, -0.8133130073547363],
+ )
+
+ # Add relevant materials
+ # Textures to use
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "3 3",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ redwood = CustomMaterial(
+ texture="WoodRed",
+ tex_name="redwood",
+ mat_name="redwood_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+
+ # initialize objects of interest
+ payload = HammerObject(
+ name="payload",
+ handle_radius=0.015,
+ handle_length=0.20,
+ handle_density=150.0,
+ handle_friction=4.0,
+ head_density_ratio=1.5,
+ )
+ trash = BoxObject(name="trash", size=[0.02, 0.02, 0.02], material=redwood)
+ self.transport = TransportGroup(
+ name="transport",
+ payload=payload,
+ trash=trash,
+ bin_size=self.bin_size,
+ )
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ mujoco_objects=list(self.transport.objects.values()),
+ )
+
+ # Create placement initializer
+ self._get_placement_initializer()
+
+ def _get_placement_initializer(self):
+ """
+ Helper function for defining placement initializer and object sampling bounds
+ """
+ # Create placement initializer
+ self.placement_initializer = SequentialCompositeSampler(name="ObjectSampler")
+
+ # Pre-define settings for each object's placement
+ object_names = ["start_bin", "lid", "payload", "target_bin", "trash", "trash_bin"]
+ table_nums = [0, 0, 0, 1, 1, 1]
+ x_centers = [
+ self.table_full_size[0] * 0.25,
+ 0, # gets overridden anyways
+ 0, # gets overridden anyways
+ -self.table_full_size[0] * 0.25,
+ 0, # gets overridden anyways
+ self.table_full_size[0] * 0.25,
+ ]
+ pos_tol = 0.005
+ rot_centers = [0, 0, np.pi / 2, 0, 0, 0]
+ rot_tols = [0, 0, np.pi / 6, 0, 0.3 * np.pi, 0]
+ rot_axes = ["z", "z", "y", "z", "z", "z"]
+ for obj_name, x, r, r_tol, r_axis, table_num in zip(
+ object_names, x_centers, rot_centers, rot_tols, rot_axes, table_nums
+ ):
+ # Get name and table
+ obj = self.transport.objects[obj_name]
+ table_pos = self.table_offsets[table_num]
+ # Create sampler for this object and add it to the sequential sampler
+ self.placement_initializer.append_sampler(
+ sampler=UniformRandomSampler(
+ name=f"{obj_name}ObjectSampler",
+ mujoco_objects=obj,
+ x_range=[x - pos_tol, x + pos_tol],
+ y_range=[-pos_tol, pos_tol],
+ rotation=[r - r_tol, r + r_tol],
+ rotation_axis=r_axis,
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=False,
+ reference_pos=table_pos,
+ z_offset=0.001,
+ )
+ )
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+        in a flattened array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+            OrderedDict: Dictionary mapping observable names to their corresponding Observable objects
+ """
+ observables = super()._setup_observables()
+
+ # low-level object information
+ if self.use_object_obs:
+ # Get robot prefix and define observables modality
+ if self.env_configuration == "bimanual":
+ pf0 = self.robots[0].robot_model.naming_prefix + "right_"
+ pf1 = self.robots[0].robot_model.naming_prefix + "left_"
+ else:
+ pf0 = self.robots[0].robot_model.naming_prefix
+ pf1 = self.robots[1].robot_model.naming_prefix
+ modality = "object"
+
+ # position and rotation of payload
+ @sensor(modality=modality)
+ def payload_pos(obs_cache):
+ return np.array(self.transport.payload_pos)
+
+ @sensor(modality=modality)
+ def payload_quat(obs_cache):
+ return np.array(self.transport.payload_quat)
+
+ # position and rotation of trash
+ @sensor(modality=modality)
+ def trash_pos(obs_cache):
+ return np.array(self.transport.trash_pos)
+
+ @sensor(modality=modality)
+ def trash_quat(obs_cache):
+ return np.array(self.transport.trash_quat)
+
+ # position and rotation of lid handle
+ @sensor(modality=modality)
+ def lid_handle_pos(obs_cache):
+ return np.array(self.transport.lid_handle_pos)
+
+ @sensor(modality=modality)
+ def lid_handle_quat(obs_cache):
+ return np.array(self.transport.lid_handle_quat)
+
+ # bin positions
+ @sensor(modality=modality)
+ def target_bin_pos(obs_cache):
+ return np.array(self.transport.target_bin_pos)
+
+ @sensor(modality=modality)
+ def trash_bin_pos(obs_cache):
+ return np.array(self.transport.trash_bin_pos)
+
+ # Relevant egocentric positions for arm0
+ @sensor(modality=modality)
+ def gripper0_to_payload(obs_cache):
+ return (
+ obs_cache["payload_pos"] - obs_cache[f"{pf0}eef_pos"]
+ if "payload_pos" in obs_cache and f"{pf0}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def gripper0_to_lid_handle(obs_cache):
+ return (
+ obs_cache["lid_handle_pos"] - obs_cache[f"{pf0}eef_pos"]
+ if "lid_handle_pos" in obs_cache and f"{pf0}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ # Relevant egocentric positions for arm1
+ @sensor(modality=modality)
+ def gripper1_to_payload(obs_cache):
+ return (
+ obs_cache["payload_pos"] - obs_cache[f"{pf1}eef_pos"]
+ if "payload_pos" in obs_cache and f"{pf1}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ @sensor(modality=modality)
+ def gripper1_to_trash(obs_cache):
+ return (
+ obs_cache["trash_pos"] - obs_cache[f"{pf1}eef_pos"]
+ if "trash_pos" in obs_cache and f"{pf1}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ # Key boolean checks
+ @sensor(modality=modality)
+ def payload_in_target_bin(obs_cache):
+ return self.transport.payload_in_target_bin
+
+ @sensor(modality=modality)
+ def trash_in_trash_bin(obs_cache):
+ return self.transport.trash_in_trash_bin
+
+ sensors = [
+ payload_pos,
+ payload_quat,
+ trash_pos,
+ trash_quat,
+ lid_handle_pos,
+ lid_handle_quat,
+ target_bin_pos,
+ trash_bin_pos,
+ gripper0_to_payload,
+ gripper0_to_lid_handle,
+ gripper1_to_payload,
+ gripper1_to_trash,
+ payload_in_target_bin,
+ trash_in_trash_bin,
+ ]
+ names = [s.__name__ for s in sensors]
+
+ # Create observables
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ super()._reset_internal()
+
+ # Update sim
+ self.transport.update_sim(sim=self.sim)
+
+ # Reset all object positions using initializer sampler if we're not directly loading from an xml
+ if not self.deterministic_reset:
+
+ # Sample from the placement initializer for all objects
+ object_placements = self.placement_initializer.sample()
+
+ # Initialize placeholders that we'll need to override the payload, lid, and trash object locations
+ start_bin_pos = None
+ target_bin_pos = None
+
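+            # The sampler order defined in _get_placement_initializer places each bin before
+            # the objects that sit inside it (start_bin before lid and payload, target_bin
+            # before trash), so the placeholders above are filled before they are read below.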
+ # Loop through all objects and reset their positions
+ for obj_pos, obj_quat, obj in object_placements.values():
+                # If this is the start bin or the target bin, store its sampled position
+ if "start_bin" in obj.name and "lid" not in obj.name:
+ start_bin_pos = obj_pos
+ elif "target_bin" in obj.name:
+ target_bin_pos = obj_pos
+ # Else if this is either the lid, payload, or trash object,
+ # we override their positions to match their respective containers' positions
+ elif "lid" in obj.name:
+ obj_pos = (start_bin_pos[0], start_bin_pos[1], obj_pos[2] + self.transport.bin_size[2])
+ elif "payload" in obj.name:
+ obj_pos = (
+ start_bin_pos[0],
+ start_bin_pos[1],
+ obj_pos[2] + self.transport.objects["start_bin"].wall_thickness,
+ )
+ elif "trash" in obj.name and "bin" not in obj.name:
+ obj_pos = (
+ target_bin_pos[0],
+ target_bin_pos[1],
+ obj_pos[2] + self.transport.objects["target_bin"].wall_thickness,
+ )
+ # Set the collision object joints
+ self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
+
+ def _check_success(self):
+ """
+        Check if payload is in target bin and trash is in trash bin
+
+ Returns:
+ bool: True if transport has been completed
+ """
+        return bool(self.transport.payload_in_target_bin and self.transport.trash_in_trash_bin)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/wipe.py b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/wipe.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6af44132dce0d9b36f2159e7ed12683c54ddb77
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/manipulation/wipe.py
@@ -0,0 +1,768 @@
+import multiprocessing
+from collections import OrderedDict
+
+import numpy as np
+
+from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
+from robosuite.models.arenas import WipeArena
+from robosuite.models.tasks import ManipulationTask
+from robosuite.utils.observables import Observable, sensor
+
+# Default Wipe environment configuration
+DEFAULT_WIPE_CONFIG = {
+ # settings for reward
+    "arm_limit_collision_penalty": -10.0,  # penalty for reaching a joint limit or for a collision of the arm (except the wiping tool) with the table
+ "wipe_contact_reward": 0.01, # reward for contacting something with the wiping tool
+    "unit_wiped_reward": 50.0,  # reward per marker wiped
+ "ee_accel_penalty": 0, # penalty for large end-effector accelerations
+ "excess_force_penalty_mul": 0.05, # penalty for each step that the force is over the safety threshold
+    "distance_multiplier": 5.0,  # multiplier for the dense reward, inversely proportional to the mean distance to the markers left to wipe
+ "distance_th_multiplier": 5.0, # multiplier in the tanh function for the aforementioned reward
+ # settings for table top
+ "table_full_size": [0.5, 0.8, 0.05], # Size of tabletop
+ "table_offset": [0.15, 0, 0.9], # Offset of table (z dimension defines max height of table)
+ "table_friction": [0.03, 0.005, 0.0001], # Friction parameters for the table
+ "table_friction_std": 0, # Standard deviation to sample different friction parameters for the table each episode
+ "table_height": 0.0, # Additional height of the table over the default location
+    "table_height_std": 0.0,  # Standard deviation to sample different heights of the table each episode
+ "line_width": 0.04, # Width of the line to wipe (diameter of the pegs)
+ "two_clusters": False, # if the dirt to wipe is one continuous line or two
+ "coverage_factor": 0.6, # how much of the table surface we cover
+ "num_markers": 100, # How many particles of dirt to generate in the environment
+ # settings for thresholds
+ "contact_threshold": 1.0, # Minimum eef force to qualify as contact [N]
+ "pressure_threshold": 0.5, # force threshold (N) to overcome to get increased contact wiping reward
+ "pressure_threshold_max": 60.0, # maximum force allowed (N)
+ # misc settings
+ "print_results": False, # Whether to print results or not
+    "get_info": False,  # Whether to grab info after each env step
+ "use_robot_obs": True, # if we use robot observations (proprioception) as input to the policy
+ "use_contact_obs": True, # if we use a binary observation for whether robot is in contact or not
+ "early_terminations": True, # Whether we allow for early terminations or not
+ "use_condensed_obj_obs": True, # Whether to use condensed object observation representation (only applicable if obj obs is active)
+}
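+
+# Note: the Wipe environment below reads every key in this dict from its task_config
+# argument, so a custom configuration should start from a full copy of these defaults
+# (for example: cfg = dict(DEFAULT_WIPE_CONFIG); cfg["num_markers"] = 50) rather than a
+# partial dict containing only the overridden keys.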
+
+
+class Wipe(SingleArmEnv):
+ """
+ This class corresponds to the Wiping task for a single robot arm
+
+ Args:
+ robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
+ (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
+ Note: Must be a single single-arm robot!
+
+ env_configuration (str): Specifies how to position the robots within the environment (default is "default").
+ For most single arm environments, this argument has no impact on the robot setup.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ gripper_types (str or list of str): type of gripper, used to instantiate
+ gripper models from gripper factory.
+ For this environment, setting a value other than the default ("WipingGripper") will raise an
+ AssertionError, as this environment is not meant to be used with any other alternative gripper.
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+ use_object_obs (bool): if True, include object (cube) information in
+ the observation.
+
+ reward_scale (None or float): Scales the normalized reward function by the amount specified.
+ If None, environment reward remains unnormalized
+
+ reward_shaping (bool): if True, use dense rewards.
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+ robot's camera list).
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ task_config (None or dict): Specifies the parameters relevant to this task. For a full list of expected
+ parameters, see the default configuration dict at the top of this file.
+ If None is specified, the default configuration will be used.
+
+ Raises:
+ AssertionError: [Gripper specified]
+ AssertionError: [Bad reward specification]
+ AssertionError: [Invalid number of robots specified]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ controller_configs=None,
+ gripper_types="WipingGripper",
+ initialization_noise="default",
+ use_camera_obs=True,
+ use_object_obs=True,
+ reward_scale=1.0,
+ reward_shaping=True,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None, # {None, instance, class, element}
+ task_config=None,
+ renderer="mujoco",
+ renderer_config=None,
+ ):
+        # Assert that the gripper type is WipingGripper, the only gripper supported by this environment
+ assert (
+ gripper_types == "WipingGripper"
+ ), "Tried to specify gripper other than WipingGripper in Wipe environment!"
+
+ # Get config
+ self.task_config = task_config if task_config is not None else DEFAULT_WIPE_CONFIG
+
+ # Set task-specific parameters
+
+ # settings for the reward
+ self.reward_scale = reward_scale
+ self.reward_shaping = reward_shaping
+ self.arm_limit_collision_penalty = self.task_config["arm_limit_collision_penalty"]
+ self.wipe_contact_reward = self.task_config["wipe_contact_reward"]
+ self.unit_wiped_reward = self.task_config["unit_wiped_reward"]
+ self.ee_accel_penalty = self.task_config["ee_accel_penalty"]
+ self.excess_force_penalty_mul = self.task_config["excess_force_penalty_mul"]
+ self.distance_multiplier = self.task_config["distance_multiplier"]
+ self.distance_th_multiplier = self.task_config["distance_th_multiplier"]
+ # Final reward computation
+        # So that it is better to finish the task than to keep touching the table for 100 steps
+ # The 0.5 comes from continuous_distance_reward at 0. If something changes, this may change as well
+ self.task_complete_reward = self.unit_wiped_reward * (self.wipe_contact_reward + 0.5)
+ # Verify that the distance multiplier is not greater than the task complete reward
+ assert (
+ self.task_complete_reward > self.distance_multiplier
+ ), "Distance multiplier cannot be greater than task complete reward!"
+
+ # settings for table top
+ self.table_full_size = self.task_config["table_full_size"]
+ self.table_height = self.task_config["table_height"]
+ self.table_height_std = self.task_config["table_height_std"]
+ delta_height = min(0, np.random.normal(self.table_height, self.table_height_std)) # sample variation in height
+ self.table_offset = np.array(self.task_config["table_offset"]) + np.array((0, 0, delta_height))
+ self.table_friction = self.task_config["table_friction"]
+ self.table_friction_std = self.task_config["table_friction_std"]
+ self.line_width = self.task_config["line_width"]
+ self.two_clusters = self.task_config["two_clusters"]
+ self.coverage_factor = self.task_config["coverage_factor"]
+ self.num_markers = self.task_config["num_markers"]
+
+ # settings for thresholds
+ self.contact_threshold = self.task_config["contact_threshold"]
+ self.pressure_threshold = self.task_config["pressure_threshold"]
+ self.pressure_threshold_max = self.task_config["pressure_threshold_max"]
+
+ # misc settings
+ self.print_results = self.task_config["print_results"]
+ self.get_info = self.task_config["get_info"]
+ self.use_robot_obs = self.task_config["use_robot_obs"]
+ self.use_contact_obs = self.task_config["use_contact_obs"]
+ self.early_terminations = self.task_config["early_terminations"]
+ self.use_condensed_obj_obs = self.task_config["use_condensed_obj_obs"]
+
+ # Scale reward if desired (see reward method for details)
+ self.reward_normalization_factor = horizon / (
+ self.num_markers * self.unit_wiped_reward + horizon * (self.wipe_contact_reward + self.task_complete_reward)
+ )
+
+ # ee resets
+ self.ee_force_bias = np.zeros(3)
+ self.ee_torque_bias = np.zeros(3)
+
+ # set other wipe-specific attributes
+ self.wiped_markers = []
+ self.collisions = 0
+ self.f_excess = 0
+ self.metadata = []
+ self.spec = "spec"
+
+ # whether to include and use ground-truth object states
+ self.use_object_obs = use_object_obs
+
+ super().__init__(
+ robots=robots,
+ env_configuration=env_configuration,
+ controller_configs=controller_configs,
+ mount_types="default",
+ gripper_types=gripper_types,
+ initialization_noise=initialization_noise,
+ use_camera_obs=use_camera_obs,
+ has_renderer=has_renderer,
+ has_offscreen_renderer=has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ camera_names=camera_names,
+ camera_heights=camera_heights,
+ camera_widths=camera_widths,
+ camera_depths=camera_depths,
+ camera_segmentations=camera_segmentations,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def reward(self, action=None):
+ """
+ Reward function for the task.
+
+ Sparse un-normalized reward:
+
+ - a discrete reward of self.unit_wiped_reward is provided per single dirt (peg) wiped during this step
+ - a discrete reward of self.task_complete_reward is provided if all dirt is wiped
+
+ Note that if the arm is either colliding or near its joint limit, a reward of 0 will be automatically given
+
+        Un-normalized summed components if using reward shaping (individual components can be set to 0):
+
+ - Reaching: in [0, self.distance_multiplier], proportional to distance between wiper and centroid of dirt
+ and zero if the table has been fully wiped clean of all the dirt
+ - Table Contact: in {0, self.wipe_contact_reward}, non-zero if wiper is in contact with table
+ - Wiping: in {0, self.unit_wiped_reward}, non-zero for each dirt (peg) wiped during this step
+ - Cleaned: in {0, self.task_complete_reward}, non-zero if no dirt remains on the table
+ - Collision / Joint Limit Penalty: in {self.arm_limit_collision_penalty, 0}, nonzero if robot arm
+ is colliding with an object
+ - Note that if this value is nonzero, no other reward components can be added
+ - Large Force Penalty: in [-inf, 0], scaled by wiper force and directly proportional to
+ self.excess_force_penalty_mul if the current force exceeds self.pressure_threshold_max
+ - Large Acceleration Penalty: in [-inf, 0], scaled by estimated wiper acceleration and directly
+ proportional to self.ee_accel_penalty
+
+ Note that the final per-step reward is normalized given the theoretical best episode return and then scaled:
+ reward_scale * (horizon /
+ (num_markers * unit_wiped_reward + horizon * (wipe_contact_reward + task_complete_reward)))
+
+ Args:
+ action (np array): [NOT USED]
+
+ Returns:
+ float: reward value
+ """
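+        # Illustrative sketch of the normalization with hypothetical numbers (not necessarily the
+        # values in DEFAULT_WIPE_CONFIG): with horizon=1000, num_markers=100, unit_wiped_reward=50
+        # and wipe_contact_reward=0.01, task_complete_reward = 50 * (0.01 + 0.5) = 25.5 and
+        # reward_normalization_factor = 1000 / (100 * 50 + 1000 * (0.01 + 25.5)) ≈ 0.0328, so every
+        # per-step reward below is multiplied by reward_scale * 0.0328.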
+ reward = 0
+
+ total_force_ee = np.linalg.norm(np.array(self.robots[0].recent_ee_forcetorques.current[:3]))
+
+ # Neg Reward from collisions of the arm with the table
+ if self.check_contact(self.robots[0].robot_model):
+ if self.reward_shaping:
+ reward = self.arm_limit_collision_penalty
+ self.collisions += 1
+ elif self.robots[0].check_q_limits():
+ if self.reward_shaping:
+ reward = self.arm_limit_collision_penalty
+ self.collisions += 1
+ else:
+ # If the arm is not colliding or in joint limits, we check if we are wiping
+ # (we don't want to reward wiping if there are unsafe situations)
+ active_markers = []
+
+ # Current 3D location of the corners of the wiping tool in world frame
+ c_geoms = self.robots[0].gripper.important_geoms["corners"]
+ corner1_id = self.sim.model.geom_name2id(c_geoms[0])
+ corner1_pos = np.array(self.sim.data.geom_xpos[corner1_id])
+ corner2_id = self.sim.model.geom_name2id(c_geoms[1])
+ corner2_pos = np.array(self.sim.data.geom_xpos[corner2_id])
+ corner3_id = self.sim.model.geom_name2id(c_geoms[2])
+ corner3_pos = np.array(self.sim.data.geom_xpos[corner3_id])
+ corner4_id = self.sim.model.geom_name2id(c_geoms[3])
+ corner4_pos = np.array(self.sim.data.geom_xpos[corner4_id])
+
+ # Unit vectors on my plane
+ v1 = corner1_pos - corner2_pos
+ v1 /= np.linalg.norm(v1)
+ v2 = corner4_pos - corner2_pos
+ v2 /= np.linalg.norm(v2)
+
+ # Corners of the tool in the coordinate frame of the plane
+ t1 = np.array([np.dot(corner1_pos - corner2_pos, v1), np.dot(corner1_pos - corner2_pos, v2)])
+ t2 = np.array([np.dot(corner2_pos - corner2_pos, v1), np.dot(corner2_pos - corner2_pos, v2)])
+ t3 = np.array([np.dot(corner3_pos - corner2_pos, v1), np.dot(corner3_pos - corner2_pos, v2)])
+ t4 = np.array([np.dot(corner4_pos - corner2_pos, v1), np.dot(corner4_pos - corner2_pos, v2)])
+
+ pp = [t1, t2, t4, t3]
+
+ # Normal of the plane defined by v1 and v2
+ n = np.cross(v1, v2)
+ n /= np.linalg.norm(n)
+
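+            # isLeft is the z-component of the 2D cross product (P1 - P0) x (P2 - P0): positive if P2
+            # lies to the left of the directed line P0 -> P1, negative if to the right, zero if collinear.
+            # PointInRectangle returns True when P is to the right of every directed edge X->Y->Z->W->X,
+            # i.e. inside the (convex) quadrilateral when its corners are listed in clockwise order.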
+ def isLeft(P0, P1, P2):
+ return (P1[0] - P0[0]) * (P2[1] - P0[1]) - (P2[0] - P0[0]) * (P1[1] - P0[1])
+
+ def PointInRectangle(X, Y, Z, W, P):
+ return isLeft(X, Y, P) < 0 and isLeft(Y, Z, P) < 0 and isLeft(Z, W, P) < 0 and isLeft(W, X, P) < 0
+
+ # Only go into this computation if there are contact points
+ if self.sim.data.ncon != 0:
+
+ # Check each marker that is still active
+ for marker in self.model.mujoco_arena.markers:
+
+ # Current marker 3D location in world frame
+ marker_pos = np.array(self.sim.data.body_xpos[self.sim.model.body_name2id(marker.root_body)])
+
+ # We use the second tool corner as point on the plane and define the vector connecting
+ # the marker position to that point
+ v = marker_pos - corner2_pos
+
+ # Shortest distance between the center of the marker and the plane
+ dist = np.dot(v, n)
+
+ # Projection of the center of the marker onto the plane
+ projected_point = np.array(marker_pos) - dist * n
+
+                    # A positive distance means the center of the marker is above the plane
+ # The plane is aligned with the bottom of the wiper and pointing up, so the marker would be over it
+ if dist > 0.0:
+ # Distance smaller than this threshold means we are close to the plane on the upper part
+ if dist < 0.02:
+ # Write touching points and projected point in coordinates of the plane
+ pp_2 = np.array(
+ [np.dot(projected_point - corner2_pos, v1), np.dot(projected_point - corner2_pos, v2)]
+ )
+ # Check if marker is within the tool center:
+ if PointInRectangle(pp[0], pp[1], pp[2], pp[3], pp_2):
+ active_markers.append(marker)
+
+                # Obtain the list of currently active (wiped) markers that were not wiped before
+ # These are the markers we are wiping at this step
+ lall = np.where(np.isin(active_markers, self.wiped_markers, invert=True))
+ new_active_markers = np.array(active_markers)[lall]
+
+ # Loop through all new markers we are wiping at this step
+ for new_active_marker in new_active_markers:
+ # Grab relevant marker id info
+ new_active_marker_geom_id = self.sim.model.geom_name2id(new_active_marker.visual_geoms[0])
+ # Make this marker transparent since we wiped it (alpha = 0)
+ self.sim.model.geom_rgba[new_active_marker_geom_id][3] = 0
+                    # Add this marker to the wiped list
+ self.wiped_markers.append(new_active_marker)
+ # Add reward if we're using the dense reward
+ if self.reward_shaping:
+ reward += self.unit_wiped_reward
+
+ # Additional reward components if using dense rewards
+ if self.reward_shaping:
+ # If we haven't wiped all the markers yet, add a smooth reward for getting closer
+ # to the centroid of the dirt to wipe
+ if len(self.wiped_markers) < self.num_markers:
+ _, _, mean_pos_to_things_to_wipe = self._get_wipe_information()
+ mean_distance_to_things_to_wipe = np.linalg.norm(mean_pos_to_things_to_wipe)
+ reward += self.distance_multiplier * (
+ 1 - np.tanh(self.distance_th_multiplier * mean_distance_to_things_to_wipe)
+ )
+
+ # Reward for keeping contact
+ if self.sim.data.ncon != 0 and self._has_gripper_contact:
+ reward += self.wipe_contact_reward
+
+ # Penalty for excessive force with the end-effector
+ if total_force_ee > self.pressure_threshold_max:
+ reward -= self.excess_force_penalty_mul * total_force_ee
+ self.f_excess += 1
+
+ # Reward for pressing into table
+ # TODO: Need to include this computation somehow in the scaled reward computation
+ elif total_force_ee > self.pressure_threshold and self.sim.data.ncon > 1:
+ reward += self.wipe_contact_reward + 0.01 * total_force_ee
+ if self.sim.data.ncon > 50:
+ reward += 10.0 * self.wipe_contact_reward
+
+ # Penalize large accelerations
+ reward -= self.ee_accel_penalty * np.mean(abs(self.robots[0].recent_ee_acc.current))
+
+ # Final reward if all wiped
+ if len(self.wiped_markers) == self.num_markers:
+ reward += self.task_complete_reward
+
+ # Printing results
+ if self.print_results:
+ string_to_print = (
+                "Process {pid}, timestep {ts:>4}: reward: {rw:8.4f} "
+ "wiped markers: {ws:>3} collisions: {sc:>3} f-excess: {fe:>3}".format(
+ pid=id(multiprocessing.current_process()),
+ ts=self.timestep,
+ rw=reward,
+ ws=len(self.wiped_markers),
+ sc=self.collisions,
+ fe=self.f_excess,
+ )
+ )
+ print(string_to_print)
+
+ # If we're scaling our reward, we normalize the per-step rewards given the theoretical best episode return
+ # This is equivalent to scaling the reward by:
+ # reward_scale * (horizon /
+ # (num_markers * unit_wiped_reward + horizon * (wipe_contact_reward + task_complete_reward)))
+ if self.reward_scale:
+ reward *= self.reward_scale * self.reward_normalization_factor
+ return reward
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Adjust base pose accordingly
+ xpos = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
+ self.robots[0].robot_model.set_base_xpos(xpos)
+
+ # Get robot's contact geoms
+ self.robot_contact_geoms = self.robots[0].robot_model.contact_geoms
+
+ mujoco_arena = WipeArena(
+ table_full_size=self.table_full_size,
+ table_friction=self.table_friction,
+ table_offset=self.table_offset,
+ table_friction_std=self.table_friction_std,
+ coverage_factor=self.coverage_factor,
+ num_markers=self.num_markers,
+ line_width=self.line_width,
+ two_clusters=self.two_clusters,
+ )
+
+ # Arena always gets set to zero origin
+ mujoco_arena.set_origin([0, 0, 0])
+
+ # task includes arena, robot, and objects of interest
+ self.model = ManipulationTask(
+ mujoco_arena=mujoco_arena,
+ mujoco_robots=[robot.robot_model for robot in self.robots],
+ )
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Creates object-based observables if enabled
+
+ Returns:
+ OrderedDict: Dictionary mapping observable names to its corresponding Observable object
+ """
+ observables = super()._setup_observables()
+
+ # Get prefix from robot model to avoid naming clashes for multiple robots
+ pf = self.robots[0].robot_model.naming_prefix
+ modality = "object"
+
+ sensors = []
+ names = []
+
+ # Add binary contact observation
+ if self.use_contact_obs:
+
+ @sensor(modality=f"{pf}proprio")
+ def gripper_contact(obs_cache):
+ return self._has_gripper_contact
+
+ sensors.append(gripper_contact)
+ names.append(f"{pf}contact")
+
+ # object information in the observation
+ if self.use_object_obs:
+
+ if self.use_condensed_obj_obs:
+ # use implicit representation of wiping objects
+ @sensor(modality=modality)
+ def wipe_radius(obs_cache):
+ wipe_rad, wipe_cent, _ = self._get_wipe_information()
+ obs_cache["wipe_centroid"] = wipe_cent
+ return wipe_rad
+
+ @sensor(modality=modality)
+ def wipe_centroid(obs_cache):
+ return obs_cache["wipe_centroid"] if "wipe_centroid" in obs_cache else np.zeros(3)
+
+ @sensor(modality=modality)
+ def proportion_wiped(obs_cache):
+ return len(self.wiped_markers) / self.num_markers
+
+ sensors += [proportion_wiped, wipe_radius, wipe_centroid]
+ names += ["proportion_wiped", "wipe_radius", "wipe_centroid"]
+
+ if self.use_robot_obs:
+ # also use ego-centric obs
+ @sensor(modality=modality)
+ def gripper_to_wipe_centroid(obs_cache):
+ return (
+ obs_cache["wipe_centroid"] - obs_cache[f"{pf}eef_pos"]
+ if "wipe_centroid" in obs_cache and f"{pf}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ sensors.append(gripper_to_wipe_centroid)
+ names.append("gripper_to_wipe_centroid")
+
+ else:
+ # use explicit representation of wiping objects
+ for i, marker in enumerate(self.model.mujoco_arena.markers):
+ marker_sensors, marker_sensor_names = self._create_marker_sensors(i, marker, modality)
+ sensors += marker_sensors
+ names += marker_sensor_names
+
+ # Create observables
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _create_marker_sensors(self, i, marker, modality="object"):
+ """
+ Helper function to create sensors for a given marker. This is abstracted in a separate function call so that we
+ don't have local function naming collisions during the _setup_observables() call.
+
+ Args:
+ i (int): ID number corresponding to the marker
+ marker (MujocoObject): Marker to create sensors for
+ modality (str): Modality to assign to all sensors
+
+ Returns:
+ 2-tuple:
+ sensors (list): Array of sensors for the given marker
+ names (list): array of corresponding observable names
+ """
+ pf = self.robots[0].robot_model.naming_prefix
+
+ @sensor(modality=modality)
+ def marker_pos(obs_cache):
+ return np.array(self.sim.data.body_xpos[self.sim.model.body_name2id(marker.root_body)])
+
+ @sensor(modality=modality)
+ def marker_wiped(obs_cache):
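+            # Index a two-element list with the boolean membership test: 1 if this marker was wiped, else 0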
+ return [0, 1][marker in self.wiped_markers]
+
+ sensors = [marker_pos, marker_wiped]
+ names = [f"marker{i}_pos", f"marker{i}_wiped"]
+
+ if self.use_robot_obs:
+ # also use ego-centric obs
+ @sensor(modality=modality)
+ def gripper_to_marker(obs_cache):
+ return (
+ obs_cache[f"marker{i}_pos"] - obs_cache[f"{pf}eef_pos"]
+ if f"marker{i}_pos" in obs_cache and f"{pf}eef_pos" in obs_cache
+ else np.zeros(3)
+ )
+
+ sensors.append(gripper_to_marker)
+ names.append(f"gripper_to_marker{i}")
+
+ return sensors, names
+
+ def _reset_internal(self):
+ super()._reset_internal()
+
+ # inherited class should reset positions of objects (only if we're not using a deterministic reset)
+ if not self.deterministic_reset:
+ self.model.mujoco_arena.reset_arena(self.sim)
+
+ # Reset all internal vars for this wipe task
+ self.timestep = 0
+ self.wiped_markers = []
+ self.collisions = 0
+ self.f_excess = 0
+
+ # ee resets - bias at initial state
+ self.ee_force_bias = np.zeros(3)
+ self.ee_torque_bias = np.zeros(3)
+
+ def _check_success(self):
+ """
+ Checks if Task succeeds (all dirt wiped).
+
+ Returns:
+ bool: True if completed task
+ """
+        return len(self.wiped_markers) == self.num_markers
+
+ def _check_terminated(self):
+ """
+ Check if the task has completed one way or another. The following conditions lead to termination:
+
+ - Collision
+ - Task completion (wiping succeeded)
+ - Joint Limit reached
+
+ Returns:
+ bool: True if episode is terminated
+ """
+
+ terminated = False
+
+ # Prematurely terminate if contacting the table with the arm
+ if self.check_contact(self.robots[0].robot_model):
+ if self.print_results:
+ print(40 * "-" + " COLLIDED " + 40 * "-")
+ terminated = True
+
+ # Prematurely terminate if task is success
+ if self._check_success():
+ if self.print_results:
+ print(40 * "+" + " FINISHED WIPING " + 40 * "+")
+ terminated = True
+
+        # Prematurely terminate if the joint limits have been reached
+ if self.robots[0].check_q_limits():
+ if self.print_results:
+ print(40 * "-" + " JOINT LIMIT " + 40 * "-")
+ terminated = True
+
+ return terminated
+
+ def _post_action(self, action):
+ """
+ In addition to super method, add additional info if requested
+
+ Args:
+ action (np.array): Action to execute within the environment
+
+ Returns:
+ 3-tuple:
+
+ - (float) reward from the environment
+ - (bool) whether the current episode is completed or not
+ - (dict) info about current env step
+ """
+ reward, done, info = super()._post_action(action)
+
+ # Update force bias
+ if np.linalg.norm(self.ee_force_bias) == 0:
+ self.ee_force_bias = self.robots[0].ee_force
+ self.ee_torque_bias = self.robots[0].ee_torque
+
+ if self.get_info:
+ info["add_vals"] = ["nwipedmarkers", "colls", "percent_viapoints_", "f_excess"]
+ info["nwipedmarkers"] = len(self.wiped_markers)
+ info["colls"] = self.collisions
+ info["percent_viapoints_"] = len(self.wiped_markers) / self.num_markers
+ info["f_excess"] = self.f_excess
+
+ # allow episode to finish early if allowed
+ if self.early_terminations:
+ done = done or self._check_terminated()
+
+ return reward, done, info
+
+ def _get_wipe_information(self):
+ """Returns set of wiping information"""
+ mean_pos_to_things_to_wipe = np.zeros(3)
+ wipe_centroid = np.zeros(3)
+ marker_positions = []
+ num_non_wiped_markers = 0
+ if len(self.wiped_markers) < self.num_markers:
+ for marker in self.model.mujoco_arena.markers:
+ if marker not in self.wiped_markers:
+ marker_pos = np.array(self.sim.data.body_xpos[self.sim.model.body_name2id(marker.root_body)])
+ wipe_centroid += marker_pos
+ marker_positions.append(marker_pos)
+ num_non_wiped_markers += 1
+ wipe_centroid /= max(1, num_non_wiped_markers)
+ mean_pos_to_things_to_wipe = wipe_centroid - self._eef_xpos
+ # Radius of circle from centroid capturing all remaining wiping markers
+ max_radius = 0
+ if num_non_wiped_markers > 0:
+ max_radius = np.max(np.linalg.norm(np.array(marker_positions) - wipe_centroid, axis=1))
+ # Return all values
+ return max_radius, wipe_centroid, mean_pos_to_things_to_wipe
+
+ @property
+ def _has_gripper_contact(self):
+ """
+        Determines whether the gripper is making contact with an object, as defined by the eef force surpassing
+ a certain threshold defined by self.contact_threshold
+
+ Returns:
+            bool: True if the contact force surpasses the given threshold magnitude
+ """
+ return np.linalg.norm(self.robots[0].ee_force - self.ee_force_bias) > self.contact_threshold
diff --git a/phantom/submodules/phantom-robosuite/robosuite/environments/robot_env.py b/phantom/submodules/phantom-robosuite/robosuite/environments/robot_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c836655a0c3f69ebfa402acdb496b475ac1e573
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/environments/robot_env.py
@@ -0,0 +1,619 @@
+from collections import OrderedDict
+from copy import deepcopy
+
+import numpy as np
+
+import robosuite.macros as macros
+from robosuite.controllers import reset_controllers
+from robosuite.environments.base import MujocoEnv
+from robosuite.robots import ROBOT_CLASS_MAPPING
+from robosuite.utils.mjcf_utils import IMAGE_CONVENTION_MAPPING
+from robosuite.utils.observables import Observable, sensor
+
+
+class RobotEnv(MujocoEnv):
+ """
+ Initializes a robot environment in Mujoco.
+
+ Args:
+ robots: Specification for specific robot(s) to be instantiated within this env
+
+ env_configuration (str): Specifies how to position the robot(s) within the environment. Default is "default",
+ which should be interpreted accordingly by any subclasses.
+
+ controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
+ custom controller. Else, uses the default controller for this specific task. Should either be single
+ dict if same controller is to be used for all robots or else it should be a list of the same length as
+ "robots" param
+
+ mount_types (None or str or list of str): type of mount, used to instantiate mount models from mount factory.
+            Default is "default", which is the default mount associated with the robot(s) in the 'robots' specification.
+ None results in no mount, and any other (valid) model overrides the default mount. Should either be
+ single str if same mount type is to be used for all robots or else it should be a list of the same
+ length as "robots" param
+
+ initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
+ The expected keys and corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ Should either be single dict if same noise value is to be used for all robots or else it should be a
+ list of the same length as "robots" param
+
+ :Note: Specifying "default" will automatically use the default noise settings.
+ Specifying None will automatically create the required dict with "magnitude" set to 0.0.
+
+ use_camera_obs (bool): if True, every observation includes rendered image(s)
+
+ has_renderer (bool): If true, render the simulation state in
+ a viewer instead of headless mode.
+
+ has_offscreen_renderer (bool): True if using off-screen rendering
+
+ render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
+ will result in the default angle being applied, which is useful as it can be dragged / panned by
+ the user using the mouse
+
+ render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.
+
+ render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.
+
+ render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
+ Defaults to -1, in which case the device will be inferred from environment variables
+ (GPUS or CUDA_VISIBLE_DEVICES).
+
+ control_freq (float): how many control signals to receive in every second. This sets the amount of
+ simulation time that passes between every action input.
+
+ horizon (int): Every episode lasts for exactly @horizon timesteps.
+
+ ignore_done (bool): True if never terminating the environment (ignore @horizon).
+
+ hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
+ only calls sim.reset and resets all robosuite-internal variables
+
+ camera_names (str or list of str): name of camera to be rendered. Should either be single str if
+ same name is to be used for all cameras' rendering or else it should be a list of cameras to render.
+
+ :Note: At least one camera must be specified if @use_camera_obs is True.
+
+ :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
+ convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
+                robot's camera list.
+
+ camera_heights (int or list of int): height of camera frame. Should either be single int if
+ same height is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_widths (int or list of int): width of camera frame. Should either be single int if
+ same width is to be used for all cameras' frames or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
+ bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
+ "camera names" param.
+
+ camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
+ for each camera. Valid options are:
+
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
+ [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
+ segmentation setting(s) to use.
+
+ robot_configs (list of dict): Per-robot configurations set from any subclass initializers.
+
+ Raises:
+ ValueError: [Camera obs require offscreen renderer]
+ ValueError: [Camera name must be specified to use camera obs]
+ """
+
+ def __init__(
+ self,
+ robots,
+ env_configuration="default",
+ mount_types="default",
+ controller_configs=None,
+ initialization_noise=None,
+ use_camera_obs=True,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ render_camera="frontview",
+ render_collision_mesh=False,
+ render_visual_mesh=True,
+ render_gpu_device_id=-1,
+ control_freq=20,
+ horizon=1000,
+ ignore_done=False,
+ hard_reset=True,
+ camera_names="agentview",
+ camera_heights=256,
+ camera_widths=256,
+ camera_depths=False,
+ camera_segmentations=None,
+ robot_configs=None,
+ renderer="mujoco",
+ renderer_config=None,
+ direct_gripper_control=False,
+ ):
+ # First, verify that correct number of robots are being inputted
+ self.env_configuration = env_configuration
+ self._check_robot_configuration(robots)
+
+ # Robot
+ robots = list(robots) if type(robots) is list or type(robots) is tuple else [robots]
+ self.num_robots = len(robots)
+ self.robot_names = robots
+ self.robots = self._input2list(None, self.num_robots)
+ self._action_dim = None
+
+ # Mount
+ mount_types = self._input2list(mount_types, self.num_robots)
+
+ # Controller
+ controller_configs = self._input2list(controller_configs, self.num_robots)
+
+ # Initialization Noise
+ initialization_noise = self._input2list(initialization_noise, self.num_robots)
+
+ # Observations -- Ground truth = object_obs, Image data = camera_obs
+ self.use_camera_obs = use_camera_obs
+
+ # Camera / Rendering Settings
+ self.has_offscreen_renderer = has_offscreen_renderer
+ self.camera_names = (
+ list(camera_names) if type(camera_names) is list or type(camera_names) is tuple else [camera_names]
+ )
+ self.num_cameras = len(self.camera_names)
+
+ self.camera_heights = self._input2list(camera_heights, self.num_cameras)
+ self.camera_widths = self._input2list(camera_widths, self.num_cameras)
+ self.camera_depths = self._input2list(camera_depths, self.num_cameras)
+ self.camera_segmentations = self._input2list(camera_segmentations, self.num_cameras)
+ # We need to parse camera segmentations more carefully since it may be a nested list
+ seg_is_nested = False
+ for i, camera_s in enumerate(self.camera_segmentations):
+ if isinstance(camera_s, list) or isinstance(camera_s, tuple):
+ seg_is_nested = True
+ break
+ camera_segs = deepcopy(self.camera_segmentations)
+ for i, camera_s in enumerate(self.camera_segmentations):
+ if camera_s is not None:
+ self.camera_segmentations[i] = self._input2list(camera_s, 1) if seg_is_nested else deepcopy(camera_segs)
+
+ # sanity checks for camera rendering
+ if self.use_camera_obs and not self.has_offscreen_renderer:
+ raise ValueError("Error: Camera observations require an offscreen renderer!")
+ if self.use_camera_obs and self.camera_names is None:
+ raise ValueError("Must specify at least one camera name when using camera obs")
+
+ # Robot configurations -- update from subclass configs
+ if robot_configs is None:
+ robot_configs = [{} for _ in range(self.num_robots)]
+ self.robot_configs = [
+ dict(
+ **{
+ "controller_config": controller_configs[idx],
+ "mount_type": mount_types[idx],
+ "initialization_noise": initialization_noise[idx],
+ "control_freq": control_freq,
+ "direct_gripper_control": direct_gripper_control,
+ },
+ **robot_config,
+ )
+ for idx, robot_config in enumerate(robot_configs)
+ ]
+
+ # Run superclass init
+ super().__init__(
+ has_renderer=has_renderer,
+ has_offscreen_renderer=self.has_offscreen_renderer,
+ render_camera=render_camera,
+ render_collision_mesh=render_collision_mesh,
+ render_visual_mesh=render_visual_mesh,
+ render_gpu_device_id=render_gpu_device_id,
+ control_freq=control_freq,
+ horizon=horizon,
+ ignore_done=ignore_done,
+ hard_reset=hard_reset,
+ renderer=renderer,
+ renderer_config=renderer_config,
+ )
+
+ def visualize(self, vis_settings):
+ """
+ In addition to super call, visualizes robots.
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "robots" keyword as well as any other relevant
+ options specified.
+ """
+ # Run superclass method first
+ super().visualize(vis_settings=vis_settings)
+ # Loop over robots to visualize them independently
+ for robot in self.robots:
+ robot.visualize(vis_settings=vis_settings)
+
+ @property
+ def _visualizations(self):
+ """
+ Visualization keywords for this environment
+
+ Returns:
+ set: All components that can be individually visualized for this environment
+ """
+ vis_set = super()._visualizations
+ vis_set.add("robots")
+ return vis_set
+
+ @property
+ def action_spec(self):
+ """
+ Action space (low, high) for this environment
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum (low) action values
+ - (np.array) maximum (high) action values
+ """
+ low, high = [], []
+ for robot in self.robots:
+ lo, hi = robot.action_limits
+ low, high = np.concatenate([low, lo]), np.concatenate([high, hi])
+ return low, high
+
+ @property
+ def action_dim(self):
+ """
+ Size of the action space
+
+ Returns:
+ int: Action space dimension
+ """
+ return self._action_dim
+
+ @staticmethod
+ def _input2list(inp, length):
+ """
+ Helper function that converts an input that is either a single value or a list into a list
+
+ Args:
+ inp (None or str or list): Input value to be converted to list
+ length (int): Length of list to broadcast input to
+
+ Returns:
+ list: input @inp converted into a list of length @length
+ """
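+        # e.g. _input2list("agentview", 2)               -> ["agentview", "agentview"]
+        #      _input2list(["frontview", "birdview"], 2) -> ["frontview", "birdview"]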
+ # convert to list if necessary
+ return list(inp) if type(inp) is list or type(inp) is tuple else [inp for _ in range(length)]
+
+ def _load_model(self):
+ """
+ Loads an xml model, puts it in self.model
+ """
+ super()._load_model()
+
+ # Load robots
+ self._load_robots()
+
+ def _setup_references(self):
+ """
+ Sets up references to important components. A reference is typically an
+ index or a list of indices that point to the corresponding elements
+ in a flatten array, which is how MuJoCo stores physical simulation data.
+ """
+ super()._setup_references()
+
+ # Setup robot-specific references as well (note: requires resetting of sim for robot first)
+ for robot in self.robots:
+ robot.reset_sim(self.sim)
+ robot.setup_references()
+
+ def _setup_observables(self):
+ """
+ Sets up observables to be used for this environment. Loops through all robots and grabs their corresponding
+ observables to add to the procedurally generated dict of observables
+
+ Returns:
+ OrderedDict: Dictionary mapping observable names to its corresponding Observable object
+ """
+ observables = super()._setup_observables()
+ # Loop through all robots and grab their observables, adding it to the proprioception modality
+ for robot in self.robots:
+ robot_obs = robot.setup_observables()
+ observables.update(robot_obs)
+
+ # Loop through cameras and update the observations if using camera obs
+ if self.use_camera_obs:
+ # Create sensor information
+ sensors = []
+ names = []
+ for (cam_name, cam_w, cam_h, cam_d, cam_segs) in zip(
+ self.camera_names,
+ self.camera_widths,
+ self.camera_heights,
+ self.camera_depths,
+ self.camera_segmentations,
+ ):
+
+                # Add this camera's sensors and observable names to our arrays
+ cam_sensors, cam_sensor_names = self._create_camera_sensors(
+ cam_name, cam_w=cam_w, cam_h=cam_h, cam_d=cam_d, cam_segs=cam_segs, modality="image"
+ )
+ sensors += cam_sensors
+ names += cam_sensor_names
+
+            # If any of the camera segmentations are not None, then we shrink all the sites as a hacky way to
+ # prevent them from being rendered in the segmentation mask
+ if not all(seg is None for seg in self.camera_segmentations):
+ self.sim.model.site_size[:, :] = 1.0e-8
+
+ # Create observables for these cameras
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _create_camera_sensors(self, cam_name, cam_w, cam_h, cam_d, cam_segs, modality="image"):
+ """
+ Helper function to create sensors for a given camera. This is abstracted in a separate function call so that we
+ don't have local function naming collisions during the _setup_observables() call.
+ Args:
+ cam_name (str): Name of camera to create sensors for
+ cam_w (int): Width of camera
+ cam_h (int): Height of camera
+ cam_d (bool): Whether to create a depth sensor as well
+ cam_segs (None or list): Type of segmentation(s) to use, where each entry can be the following:
+ `None`: no segmentation sensor used
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+
+ modality (str): Modality to assign to all sensors
+ Returns:
+ 2-tuple:
+ sensors (list): Array of sensors for the given camera
+ names (list): array of corresponding observable names
+ """
+ # Make sure we get correct convention
+ convention = IMAGE_CONVENTION_MAPPING[macros.IMAGE_CONVENTION]
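+        # convention resolves to a slice step (nominally +1 for "opengl", -1 for "opencv"), so that
+        # img[::convention] below either keeps or vertically flips the rendered rows.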
+
+ # Create sensor information
+ sensors = []
+ names = []
+
+ # Add camera observables to the dict
+ rgb_sensor_name = f"{cam_name}_image"
+ depth_sensor_name = f"{cam_name}_depth"
+ segmentation_sensor_name = f"{cam_name}_segmentation"
+
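+        # When depth is requested, camera_rgb stashes the depth map in obs_cache so the separate
+        # camera_depth sensor defined below can return it without triggering a second render call.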
+ @sensor(modality=modality)
+ def camera_rgb(obs_cache):
+ img = self.sim.render(
+ camera_name=cam_name,
+ width=cam_w,
+ height=cam_h,
+ depth=cam_d,
+ )
+ if cam_d:
+ rgb, depth = img
+ obs_cache[depth_sensor_name] = np.expand_dims(depth[::convention], axis=-1)
+ return rgb[::convention]
+ else:
+ return img[::convention]
+
+ sensors.append(camera_rgb)
+ names.append(rgb_sensor_name)
+
+ if cam_d:
+
+ @sensor(modality=modality)
+ def camera_depth(obs_cache):
+ return obs_cache[depth_sensor_name] if depth_sensor_name in obs_cache else np.zeros((cam_h, cam_w, 1))
+
+ sensors.append(camera_depth)
+ names.append(depth_sensor_name)
+
+ if cam_segs is not None:
+ # Define mapping we'll use for segmentation
+ for cam_s in cam_segs:
+ seg_sensor, seg_sensor_name = self._create_segementation_sensor(
+ cam_name=cam_name,
+ cam_w=cam_w,
+ cam_h=cam_h,
+ cam_s=cam_s,
+ seg_name_root=segmentation_sensor_name,
+ modality=modality,
+ )
+
+ sensors.append(seg_sensor)
+ names.append(seg_sensor_name)
+
+ return sensors, names
+
+ def _create_segementation_sensor(self, cam_name, cam_w, cam_h, cam_s, seg_name_root, modality="image"):
+ """
+        Helper function to create a segmentation sensor for a given camera. This is abstracted in a separate function call so that we
+ don't have local function naming collisions during the _setup_observables() call.
+
+ Args:
+ cam_name (str): Name of camera to create sensors for
+ cam_w (int): Width of camera
+ cam_h (int): Height of camera
+            cam_s (str): Type of segmentation to use; should be one of the following:
+ `'instance'`: segmentation at the class-instance level
+ `'class'`: segmentation at the class level
+ `'element'`: segmentation at the per-geom level
+ seg_name_root (str): Sensor name root to assign to this sensor
+
+ modality (str): Modality to assign to all sensors
+
+ Returns:
+ 2-tuple:
+ camera_segmentation (function): Generated sensor function for this segmentation sensor
+ name (str): Corresponding sensor name
+ """
+ # Make sure we get correct convention
+ convention = IMAGE_CONVENTION_MAPPING[macros.IMAGE_CONVENTION]
+
+ if cam_s == "instance":
+ name2id = {inst: i for i, inst in enumerate(list(self.model.instances_to_ids.keys()))}
+ mapping = {idn: name2id[inst] for idn, inst in self.model.geom_ids_to_instances.items()}
+ elif cam_s == "class":
+ name2id = {cls: i for i, cls in enumerate(list(self.model.classes_to_ids.keys()))}
+ mapping = {idn: name2id[cls] for idn, cls in self.model.geom_ids_to_classes.items()}
+ else: # element
+ # No additional mapping needed
+ mapping = None
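+        # mapping (when not None) converts raw geom ids into contiguous instance- or class-level ids,
+        # e.g. every geom belonging to one robot instance collapses to a single integer label; geoms
+        # without a mapping fall back to -1, and the +1 offset below reserves 0 for them.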
+
+ @sensor(modality=modality)
+ def camera_segmentation(obs_cache):
+ seg = self.sim.render(
+ camera_name=cam_name,
+ width=cam_w,
+ height=cam_h,
+ depth=False,
+ segmentation=True,
+ )
+ seg = np.expand_dims(seg[::convention, :, 1], axis=-1)
+ # Map raw IDs to grouped IDs if we're using instance or class-level segmentation
+ if mapping is not None:
+ seg = (
+ np.fromiter(map(lambda x: mapping.get(x, -1), seg.flatten()), dtype=np.int32).reshape(
+ cam_h, cam_w, 1
+ )
+ + 1
+ )
+ return seg
+
+ name = f"{seg_name_root}_{cam_s}"
+
+ return camera_segmentation, name
+
+ def _reset_internal(self):
+ """
+ Resets simulation internal configurations.
+ """
+ # Run superclass reset functionality
+ super()._reset_internal()
+
+ # Reset controllers
+ reset_controllers()
+
+ # Reset action dim
+ self._action_dim = 0
+
+ # Reset robot and update action space dimension along the way
+ for robot in self.robots:
+ robot.reset(deterministic=self.deterministic_reset)
+ self._action_dim += robot.action_dim
+
+ # Update cameras if appropriate
+ if self.use_camera_obs:
+ temp_names = []
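+            # e.g. camera_names=["agentview", "all-eye_in_hand"] keeps "agentview" as-is and expands the
+            # second entry into every robot camera whose name contains "eye_in_hand", duplicating the
+            # matching width / height / depth settings for each camera that gets added.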
+ for cam_name in self.camera_names:
+ if "all-" in cam_name:
+ # We need to add all robot-specific camera names that include the key after the tag "all-"
+ start_idx = len(temp_names) - 1
+ key = cam_name.replace("all-", "")
+ for robot in self.robots:
+ for robot_cam_name in robot.robot_model.cameras:
+ if key in robot_cam_name:
+ temp_names.append(robot_cam_name)
+ # We also need to broadcast the corresponding values from each camera dimensions as well
+ end_idx = len(temp_names) - 1
+ self.camera_widths = (
+ self.camera_widths[:start_idx]
+ + [self.camera_widths[start_idx]] * (end_idx - start_idx)
+ + self.camera_widths[(start_idx + 1) :]
+ )
+ self.camera_heights = (
+ self.camera_heights[:start_idx]
+ + [self.camera_heights[start_idx]] * (end_idx - start_idx)
+ + self.camera_heights[(start_idx + 1) :]
+ )
+ self.camera_depths = (
+ self.camera_depths[:start_idx]
+ + [self.camera_depths[start_idx]] * (end_idx - start_idx)
+ + self.camera_depths[(start_idx + 1) :]
+ )
+ else:
+ # We simply add this camera to the temp_names
+ temp_names.append(cam_name)
+ # Lastly, replace camera names with the updated ones
+ self.camera_names = temp_names
+
+ def _pre_action(self, action, policy_step=False):
+ """
+        Overrides the superclass method to control the robot(s) within this environment with their respective
+        controllers, using the passed actions and gripper control.
+
+ Args:
+ action (np.array): The control to apply to the robot(s). Note that this should be a flat 1D array that
+ encompasses all actions to be distributed to each robot if there are multiple. For each section of the
+ action space assigned to a single robot, the first @self.robots[i].controller.control_dim dimensions
+ should be the desired controller actions and if the robot has a gripper, the next
+ @self.robots[i].gripper.dof dimensions should be actuation controls for the gripper.
+ policy_step (bool): Whether a new policy step (action) is being taken
+
+ Raises:
+ AssertionError: [Invalid action dimension]
+ """
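+        # e.g. with two robots whose action_dims are 7 and 8, action[:7] is routed to robot 0 and
+        # action[7:15] to robot 1 by the cutoff loop below.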
+ # Verify that the action is the correct dimension
+ assert len(action) == self.action_dim, "environment got invalid action dimension -- expected {}, got {}".format(
+ self.action_dim, len(action)
+ )
+
+ # Update robot joints based on controller actions
+ cutoff = 0
+ for idx, robot in enumerate(self.robots):
+ robot_action = action[cutoff : cutoff + robot.action_dim]
+ robot.control(robot_action, policy_step=policy_step)
+ cutoff += robot.action_dim
+
+ def _load_robots(self):
+ """
+ Instantiates robots and stores them within the self.robots attribute
+ """
+ # Loop through robots and instantiate Robot object for each
+ for idx, (name, config) in enumerate(zip(self.robot_names, self.robot_configs)):
+ # Create the robot instance
+ self.robots[idx] = ROBOT_CLASS_MAPPING[name](robot_type=name, idn=idx, **config)
+ # Now, load the robot models
+ self.robots[idx].load_model()
+
+ def reward(self, action):
+ """
+ Runs superclass method by default
+ """
+ return super().reward(action)
+
+ def _check_success(self):
+ """
+ Runs superclass method by default
+ """
+ return super()._check_success()
+
+ def _check_robot_configuration(self, robots):
+ """
+ Sanity check to make sure inputted robots and the corresponding requested task/configuration combo is legal.
+ Should be implemented in every specific task module
+
+ Args:
+ robots (str or list of str): Inputted requested robots at the task-level environment
+ """
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/macros.py b/phantom/submodules/phantom-robosuite/robosuite/macros.py
new file mode 100644
index 0000000000000000000000000000000000000000..918c51590448a05557479f3552834f3079ff35c1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/macros.py
@@ -0,0 +1,55 @@
+"""
+Macro settings that can be imported and toggled. Internally, specific parts of the codebase rely on these settings
+for determining core functionality.
+
+To make sure global reference is maintained, should import these settings as:
+
+`import robosuite.macros as macros`
+"""
+
+# Global Mujoco Simulation Parameters
+SIMULATION_TIMESTEP = 0.002 # Internal simulation timestep (in seconds)
+
+# Instance Randomization
+# Used if we want to randomize geom groups uniformly per instance -- e.g.: entire robot arm, vs. per-joint geom
+# This should get set to True in your script BEFORE an environment is created or the DR wrapper is used
+USING_INSTANCE_RANDOMIZATION = False
+
+# Numba settings
+# TODO: Numba causes BSOD for NutAssembly task when rendering offscreen (deterministically!)
+ENABLE_NUMBA = True
+CACHE_NUMBA = True
+
+# Image Convention
+# Robosuite (Mujoco)-rendered images are based on the OpenGL coordinate frame convention, whereas many downstream
+# applications assume an OpenCV coordinate frame convention. For consistency, you can set the image convention
+# here; this will assure that any rendered frames will match the associated convention.
+# See the figure at the bottom of https://amytabb.com/ts/2019_06_28/ for an informative overview.
+IMAGE_CONVENTION = "opencv" # Options are {"opengl", "opencv"}
+
+# Image concatenation
+# In general, observations are concatenated together by modality. However, image observations are expensive memory-wise,
+# so we skip concatenating all images together by default, unless this flag is set to True
+CONCATENATE_IMAGES = False
+
+MUJOCO_GPU_RENDERING = True
+
+# Spacemouse settings. Used by SpaceMouse class in robosuite/devices/spacemouse.py
+SPACEMOUSE_VENDOR_ID = 9583
+SPACEMOUSE_PRODUCT_ID = 50734
+
+# If LOGGING LEVEL is set to None, the logger will be turned off
+CONSOLE_LOGGING_LEVEL = "WARN"
+# File logging is written to /tmp/robosuite_{time}_{pid}.log by default
+FILE_LOGGING_LEVEL = None
+
+# Override with macros from macros_private.py file, if it exists
+try:
+ from robosuite.macros_private import *
+except ImportError:
+ import robosuite
+ from robosuite.utils.log_utils import ROBOSUITE_DEFAULT_LOGGER
+
+ ROBOSUITE_DEFAULT_LOGGER.warn("No private macro file found!")
+ ROBOSUITE_DEFAULT_LOGGER.warn("It is recommended to use a private macro file")
+ ROBOSUITE_DEFAULT_LOGGER.warn("To setup, run: python {}/scripts/setup_macros.py".format(robosuite.__path__[0]))
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73dc77f94297a4ff36e4db5e3e360f691189f6f8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/__init__.py
@@ -0,0 +1,4 @@
+import os
+from .world import MujocoWorldBase
+
+assets_root = os.path.join(os.path.dirname(__file__), "assets")
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab448ace61fb1cb6d2947202d009c7a5601e4096
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/__init__.py
@@ -0,0 +1,9 @@
+from .arena import Arena
+from .table_arena import TableArena
+from .table_arena2 import TableArena2
+from .phantom_table_arena import PhantomTableArena
+from .multi_table_arena import MultiTableArena
+from .pegs_arena import PegsArena
+from .bins_arena import BinsArena
+from .empty_arena import EmptyArena
+from .wipe_arena import WipeArena
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/arena.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..d274f9f5e56e7b5f2fe5ef6f9b6bab4312530e1a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/arena.py
@@ -0,0 +1,81 @@
+import numpy as np
+
+from robosuite.models.base import MujocoXML
+from robosuite.utils.mjcf_utils import (
+ ENVIRONMENT_COLLISION_COLOR,
+ array_to_string,
+ find_elements,
+ new_body,
+ new_element,
+ new_geom,
+ new_joint,
+ recolor_collision_geoms,
+ string_to_array,
+)
+
+
+class Arena(MujocoXML):
+ """Base arena class."""
+
+ def __init__(self, fname):
+ super().__init__(fname)
+ # Get references to floor and bottom
+ self.bottom_pos = np.zeros(3)
+ self.floor = self.worldbody.find("./geom[@name='floor']")
+
+ # Run any necessary post-processing on the model
+ self._postprocess_arena()
+
+ # Recolor all geoms
+ recolor_collision_geoms(
+ root=self.worldbody,
+ rgba=ENVIRONMENT_COLLISION_COLOR,
+ exclude=lambda e: True if e.get("name", None) == "floor" else False,
+ )
+
+ def set_origin(self, offset):
+ """
+ Applies a constant offset to all objects.
+
+ Args:
+ offset (3-tuple): (x,y,z) offset to apply to all nodes in this XML
+ """
+ offset = np.array(offset)
+ for node in self.worldbody.findall("./*[@pos]"):
+ cur_pos = string_to_array(node.get("pos"))
+ new_pos = cur_pos + offset
+ node.set("pos", array_to_string(new_pos))
+
+ def set_camera(self, camera_name, pos, quat, camera_attribs=None):
+ """
+ Sets a camera with @camera_name. If the camera already exists, then this overwrites its pos and quat values.
+
+ Args:
+ camera_name (str): Camera name to search for / create
+ pos (3-array): (x,y,z) coordinates of camera in world frame
+ quat (4-array): (w,x,y,z) quaternion of camera in world frame
+ camera_attribs (dict): If specified, should be additional keyword-mapped attributes for this camera.
+ See http://www.mujoco.org/book/XMLreference.html#camera for exact attribute specifications.
+ """
+ # Determine if camera already exists
+ camera = find_elements(root=self.worldbody, tags="camera", attribs={"name": camera_name}, return_first=True)
+
+ # Compose attributes
+ if camera_attribs is None:
+ camera_attribs = {}
+ camera_attribs["pos"] = array_to_string(pos)
+ camera_attribs["quat"] = array_to_string(quat)
+
+ if camera is None:
+ # If camera doesn't exist, then add a new camera with the specified attributes
+ self.worldbody.append(new_element(tag="camera", name=camera_name, **camera_attribs))
+ else:
+ # Otherwise, we edit all specified attributes in that camera
+ for attrib, value in camera_attribs.items():
+ camera.set(attrib, value)
+
+ def _postprocess_arena(self):
+ """
+ Runs any necessary post-processing on the imported Arena model
+ """
+ pass
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/bins_arena.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/bins_arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..b50b40bcdd298a3398607fb284bf0a48962343f7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/bins_arena.py
@@ -0,0 +1,34 @@
+import numpy as np
+
+from robosuite.models.arenas import Arena
+from robosuite.utils.mjcf_utils import array_to_string, xml_path_completion
+
+
+class BinsArena(Arena):
+ """
+ Workspace that contains two bins placed side by side.
+
+ Args:
+ bin1_pos (3-tuple): (x,y,z) position to place bin1
+ table_full_size (3-tuple): (L,W,H) full dimensions of the table
+ table_friction (3-tuple): (sliding, torsional, rolling) friction parameters of the table
+ """
+
+ def __init__(
+ self, bin1_pos=(0.1, -0.5, 0.8), table_full_size=(0.39, 0.49, 0.82), table_friction=(1, 0.005, 0.0001)
+ ):
+ super().__init__(xml_path_completion("arenas/bins_arena.xml"))
+
+ self.table_full_size = np.array(table_full_size)
+ self.table_half_size = self.table_full_size / 2
+ self.table_friction = table_friction
+
+ self.bin1_body = self.worldbody.find("./body[@name='bin1']")
+ self.bin2_body = self.worldbody.find("./body[@name='bin2']")
+ self.table_top_abs = np.array(bin1_pos)
+
+ self.configure_location()
+
+ def configure_location(self):
+ """Configures correct locations for this arena"""
+ self.floor.set("pos", array_to_string(self.bottom_pos))
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/empty_arena.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/empty_arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..e10da831b24b0c1870ffb640327cdf543efd02ab
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/empty_arena.py
@@ -0,0 +1,9 @@
+from robosuite.models.arenas import Arena
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class EmptyArena(Arena):
+ """Empty workspace."""
+
+ def __init__(self):
+ super().__init__(xml_path_completion("arenas/empty_arena.xml"))
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/multi_table_arena.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/multi_table_arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..f62ef357e2c45a483ac2643cdea084a7b9380722
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/multi_table_arena.py
@@ -0,0 +1,149 @@
+from collections.abc import Iterable
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.arenas import Arena
+from robosuite.utils.mjcf_utils import (
+ array_to_string,
+ new_body,
+ new_geom,
+ new_site,
+ string_to_array,
+ xml_path_completion,
+)
+
+
+class MultiTableArena(Arena):
+ """
+ Workspace that contains multiple tables.
+ Args:
+ table_offsets (list of 3-array): (x,y,z) offset from center of arena when placing each table.
+ Note that the number of tables is inferred from the length of this list
+ Note that the z value sets the upper limit of the table
+ table_rots (float or list of float): z-rotation to apply to each table. If only a
+ single value is given, it will be broadcasted according to the total number of tables
+ table_full_sizes (3-array or list of 3-array): (L,W,H) full dimensions of each table. If only a
+ single value is given, it will be broadcasted according to the total number of tables
+ table_frictions (3-array or list of 3-array): (sliding, torsional, rolling) friction parameters of each table.
+ has_legs (bool or list of bool): whether each table has legs or not. If only a
+ single value is given, it will be broadcasted according to the total number of tables
+ xml (str): xml file to load arena
+ """
+
+ def __init__(
+ self,
+ table_offsets,
+ table_rots=0,
+ table_full_sizes=(0.8, 0.8, 0.05),
+ table_frictions=(1, 0.005, 0.0001),
+ has_legs=True,
+ xml="arenas/multi_table_arena.xml",
+ ):
+ # Set internal vars
+ self.table_offsets = np.array(table_offsets)
+ self.n_tables = self.table_offsets.shape[0]
+ self.table_rots = (
+ np.array(table_rots) if isinstance(table_rots, Iterable) else np.ones(self.n_tables) * table_rots
+ )
+ self.table_full_sizes = np.array(table_full_sizes)
+ if len(self.table_full_sizes.shape) == 1:
+ self.table_full_sizes = np.stack([self.table_full_sizes] * self.n_tables, axis=0)
+ self.table_half_sizes = self.table_full_sizes / 2
+ self.table_frictions = np.array(table_frictions)
+ if len(self.table_frictions.shape) == 1:
+ self.table_frictions = np.stack([self.table_frictions] * self.n_tables, axis=0)
+ self.center_pos = np.array(self.table_offsets)
+ self.center_pos[:, 2] -= self.table_half_sizes[:, 2]
+ self.has_legs = has_legs if isinstance(has_legs, Iterable) else [has_legs] * self.n_tables
+
+ # Run super init
+ super().__init__(xml_path_completion(xml))
+
+ # Configure any relevant locations
+ self.configure_location()
+
+ def _add_table(self, name, offset, rot, half_size, friction, has_legs):
+ """
+ Procedurally generates a table and adds it to the XML
+ """
+ # Create body for this table, and add it to worldbody
+ table_body = new_body(name=name, pos=offset - np.array([0, 0, half_size[2]]))
+ self.worldbody.append(table_body)
+
+ # Create core attributes for table geoms
+ table_attribs = {
+ "pos": (0, 0, 0),
+ "quat": T.convert_quat(T.axisangle2quat([0, 0, rot]), to="wxyz"),
+ "size": half_size,
+ "type": "box",
+ }
+
+ # Create collision and visual bodies, and add them to the table body
+ col_geom = new_geom(name=f"{name}_collision", group=0, friction=friction, **table_attribs)
+ vis_geom = new_geom(
+ name=f"{name}_visual", group=1, conaffinity=0, contype=0, material="table_ceramic", **table_attribs
+ )
+ table_body.append(col_geom)
+ table_body.append(vis_geom)
+
+ # Add tabletop site to table
+ top_site = new_site(name=f"{name}_top", pos=(0, 0, half_size[2]), size=(0.001, 0.001, 0.001), rgba=(0, 0, 0, 0))
+ table_body.append(top_site)
+
+ # Add legs if requested
+ if has_legs:
+ delta_x = [0.1, -0.1, -0.1, 0.1]
+ delta_y = [0.1, 0.1, -0.1, -0.1]
+ for i, (dx, dy) in enumerate(zip(delta_x, delta_y)):
+ # If x-length of table is less than a certain length, place leg in the middle between ends
+ # Otherwise we place it near the edge
+ x = 0
+ if half_size[0] > abs(dx * 2.0):
+ x += np.sign(dx) * half_size[0] - dx
+ # Repeat the same process for y
+ y = 0
+ if half_size[1] > abs(dy * 2.0):
+ y += np.sign(dy) * half_size[1] - dy
+ # Rotate x and y values according to requested rotation
+ c, s = np.cos(rot), np.sin(rot)
+ rot_xy = np.array([[c, -s], [s, c]]) @ np.array([x, y])
+ # Add in offsets
+ x = rot_xy[0]
+ y = rot_xy[1]
+ # Get z value
+ z = (offset[2] - half_size[2]) / 2.0
+ # Create visual geom and add it to table body
+ leg_geom = new_geom(
+ name=f"{name}_leg{i}_visual",
+ pos=(x, y, -z),
+ type="cylinder",
+ size=(0.025, z),
+ group=1,
+ conaffinity=0,
+ contype=0,
+ material="table_legs_metal",
+ )
+ table_body.append(leg_geom)
+
+ def configure_location(self):
+ """Configures correct locations for this arena"""
+ # Set floor correctly
+ self.floor.set("pos", array_to_string(self.bottom_pos))
+
+ def _postprocess_arena(self):
+ """
+ Runs any necessary post-processing on the imported Arena model
+ """
+ # Create tables
+ for i, (offset, rot, half_size, friction, legs) in enumerate(
+ zip(self.table_offsets, self.table_rots, self.table_half_sizes, self.table_frictions, self.has_legs)
+ ):
+ self._add_table(
+ name=f"table{i}",
+ offset=offset,
+ rot=rot,
+ half_size=half_size,
+ friction=friction,
+ has_legs=legs,
+ )
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/pegs_arena.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/pegs_arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..492b99d331038c2a791d111e43d797030ec1cac9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/pegs_arena.py
@@ -0,0 +1,30 @@
+from robosuite.models.arenas import TableArena
+
+
+class PegsArena(TableArena):
+ """
+ Workspace that contains a tabletop with two fixed pegs.
+
+ Args:
+ table_full_size (3-tuple): (L,W,H) full dimensions of the table
+ table_friction (3-tuple): (sliding, torsional, rolling) friction parameters of the table
+ table_offset (3-tuple): (x,y,z) offset from center of arena when placing table.
+ Note that the z value sets the upper limit of the table
+ """
+
+ def __init__(
+ self,
+ table_full_size=(0.45, 0.69, 0.05),
+ table_friction=(1, 0.005, 0.0001),
+ table_offset=(0, 0, 0),
+ ):
+ super().__init__(
+ table_full_size=table_full_size,
+ table_friction=table_friction,
+ table_offset=table_offset,
+ xml="arenas/pegs_arena.xml",
+ )
+
+ # Get references to peg bodies
+ self.peg1_body = self.worldbody.find("./body[@name='peg1']")
+ self.peg2_body = self.worldbody.find("./body[@name='peg2']")
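The worldbody.find(...) calls above rely on ElementTree's limited XPath support to look up named bodies in the loaded MJCF. A minimal standalone sketch of that lookup pattern, using a made-up XML fragment rather than the real pegs_arena.xml, looks like this:

import xml.etree.ElementTree as ET

# Illustrative MJCF-style fragment; the actual pegs_arena.xml ships with robosuite.
worldbody = ET.fromstring(
    "<worldbody>"
    "<body name='table' pos='0 0 0.8'/>"
    "<body name='peg1' pos='0.1 0.2 0.9'/>"
    "<body name='peg2' pos='0.1 -0.2 0.9'/>"
    "</worldbody>"
)
peg1 = worldbody.find("./body[@name='peg1']")
print(peg1.get("pos"))  # -> 0.1 0.2 0.9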
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/phantom_table_arena.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/phantom_table_arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..abca7d38409eb8511489beb1e59e8da857e28be2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/phantom_table_arena.py
@@ -0,0 +1,97 @@
+import numpy as np
+
+from robosuite.models.arenas import Arena
+from robosuite.utils.mjcf_utils import array_to_string, string_to_array, xml_path_completion
+
+
+class PhantomTableArena(Arena):
+ """
+ Workspace that contains an empty table.
+
+ Args:
+ table_full_size (3-tuple): (L,W,H) full dimensions of the table
+ table_friction (3-tuple): (sliding, torsional, rolling) friction parameters of the table
+ table_offset (3-tuple): (x,y,z) offset from center of arena when placing table.
+ Note that the z value sets the upper limit of the table
+ has_legs (bool): whether the table has legs or not
+ xml (str): xml file to load arena
+ """
+
+ def __init__(
+ self,
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1, 0.005, 0.0001),
+ table_offset=(0, 0, 0.8),
+ has_legs=True,
+ xml="arenas/phantom_table_arena.xml",
+ ):
+ super().__init__(xml_path_completion(xml))
+
+ self.table_full_size = np.array(table_full_size)
+ self.table_half_size = self.table_full_size / 2
+ self.table_friction = table_friction
+ self.table_offset = table_offset
+ self.center_pos = self.bottom_pos + np.array([0, 0, -self.table_half_size[2]]) + self.table_offset
+
+ self.table_body = self.worldbody.find("./body[@name='table']")
+ self.table_collision = self.table_body.find("./geom[@name='table_collision']")
+ self.table_visual = self.table_body.find("./geom[@name='table_visual']")
+ self.table_top = self.table_body.find("./site[@name='table_top']")
+
+ self.has_legs = has_legs
+ self.table_legs_visual = [
+ self.table_body.find("./geom[@name='table_leg1_visual']"),
+ self.table_body.find("./geom[@name='table_leg2_visual']"),
+ self.table_body.find("./geom[@name='table_leg3_visual']"),
+ self.table_body.find("./geom[@name='table_leg4_visual']"),
+ ]
+
+ self.configure_location()
+
+ def configure_location(self):
+ """Configures correct locations for this arena"""
+ self.floor.set("pos", array_to_string(self.bottom_pos))
+
+ self.table_body.set("pos", array_to_string(self.center_pos))
+ self.table_collision.set("size", array_to_string(self.table_half_size))
+ self.table_collision.set("friction", array_to_string(self.table_friction))
+ self.table_visual.set("size", array_to_string(self.table_half_size))
+
+ self.table_top.set("pos", array_to_string(np.array([0, 0, self.table_half_size[2]])))
+
+ # If we're not using legs, set their size to 0
+ if not self.has_legs:
+ for leg in self.table_legs_visual:
+ leg.set("rgba", array_to_string([1, 0, 0, 0]))
+ leg.set("size", array_to_string([0.0001, 0.0001]))
+ else:
+ # Otherwise, set leg locations appropriately
+ delta_x = [0.1, -0.1, -0.1, 0.1]
+ delta_y = [0.1, 0.1, -0.1, -0.1]
+ for leg, dx, dy in zip(self.table_legs_visual, delta_x, delta_y):
+ # If x-length of table is less than a certain length, place leg in the middle between ends
+ # Otherwise we place it near the edge
+ x = 0
+ if self.table_half_size[0] > abs(dx * 2.0):
+ x += np.sign(dx) * self.table_half_size[0] - dx
+ # Repeat the same process for y
+ y = 0
+ if self.table_half_size[1] > abs(dy * 2.0):
+ y += np.sign(dy) * self.table_half_size[1] - dy
+ # Get z value
+ z = (self.table_offset[2] - self.table_half_size[2]) / 2.0
+ # Set leg position
+ leg.set("pos", array_to_string([x, y, -z]))
+ # Set leg size
+ leg.set("size", array_to_string([0.025, z]))
+
+ @property
+ def table_top_abs(self):
+ """
+ Grabs the absolute position of table top
+
+ Returns:
+ np.array: (x,y,z) table position
+ """
+ return string_to_array(self.floor.get("pos")) + self.table_offset
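Assuming the vendored robosuite package, its bundled MJCF assets, and a working MuJoCo installation are importable in the environment, the arena can be constructed and queried directly. This is a usage sketch only, not something the demo pipeline itself executes:

from robosuite.models.arenas.phantom_table_arena import PhantomTableArena

# Build the arena with a slightly lower tabletop; dimensions are in meters.
arena = PhantomTableArena(table_full_size=(0.8, 0.8, 0.05), table_offset=(0, 0, 0.75))
print(arena.table_half_size)  # -> [0.4 0.4 0.025]
print(arena.table_top_abs)    # absolute (x, y, z) of the tabletop surface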
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/table_arena.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/table_arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a14c11c95d612ec6fd34eea62623d28a2460220
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/table_arena.py
@@ -0,0 +1,97 @@
+import numpy as np
+
+from robosuite.models.arenas import Arena
+from robosuite.utils.mjcf_utils import array_to_string, string_to_array, xml_path_completion
+
+
+class TableArena(Arena):
+ """
+ Workspace that contains an empty table.
+
+ Args:
+ table_full_size (3-tuple): (L,W,H) full dimensions of the table
+ table_friction (3-tuple): (sliding, torsional, rolling) friction parameters of the table
+ table_offset (3-tuple): (x,y,z) offset from center of arena when placing table.
+ Note that the z value sets the upper limit of the table
+ has_legs (bool): whether the table has legs or not
+ xml (str): xml file to load arena
+ """
+
+ def __init__(
+ self,
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1, 0.005, 0.0001),
+ table_offset=(0, 0, 0.8),
+ has_legs=True,
+ xml="arenas/table_arena.xml",
+ ):
+ super().__init__(xml_path_completion(xml))
+
+ self.table_full_size = np.array(table_full_size)
+ self.table_half_size = self.table_full_size / 2
+ self.table_friction = table_friction
+ self.table_offset = table_offset
+ self.center_pos = self.bottom_pos + np.array([0, 0, -self.table_half_size[2]]) + self.table_offset
+
+ self.table_body = self.worldbody.find("./body[@name='table']")
+ self.table_collision = self.table_body.find("./geom[@name='table_collision']")
+ self.table_visual = self.table_body.find("./geom[@name='table_visual']")
+ self.table_top = self.table_body.find("./site[@name='table_top']")
+
+ self.has_legs = has_legs
+ self.table_legs_visual = [
+ self.table_body.find("./geom[@name='table_leg1_visual']"),
+ self.table_body.find("./geom[@name='table_leg2_visual']"),
+ self.table_body.find("./geom[@name='table_leg3_visual']"),
+ self.table_body.find("./geom[@name='table_leg4_visual']"),
+ ]
+
+ self.configure_location()
+
+ def configure_location(self):
+ """Configures correct locations for this arena"""
+ self.floor.set("pos", array_to_string(self.bottom_pos))
+
+ self.table_body.set("pos", array_to_string(self.center_pos))
+ self.table_collision.set("size", array_to_string(self.table_half_size))
+ self.table_collision.set("friction", array_to_string(self.table_friction))
+ self.table_visual.set("size", array_to_string(self.table_half_size))
+
+ self.table_top.set("pos", array_to_string(np.array([0, 0, self.table_half_size[2]])))
+
+ # If we're not using legs, set their size to 0
+ if not self.has_legs:
+ for leg in self.table_legs_visual:
+ leg.set("rgba", array_to_string([1, 0, 0, 0]))
+ leg.set("size", array_to_string([0.0001, 0.0001]))
+ else:
+ # Otherwise, set leg locations appropriately
+ delta_x = [0.1, -0.1, -0.1, 0.1]
+ delta_y = [0.1, 0.1, -0.1, -0.1]
+ for leg, dx, dy in zip(self.table_legs_visual, delta_x, delta_y):
+ # If x-length of table is less than a certain length, place leg in the middle between ends
+ # Otherwise we place it near the edge
+ x = 0
+ if self.table_half_size[0] > abs(dx * 2.0):
+ x += np.sign(dx) * self.table_half_size[0] - dx
+ # Repeat the same process for y
+ y = 0
+ if self.table_half_size[1] > abs(dy * 2.0):
+ y += np.sign(dy) * self.table_half_size[1] - dy
+ # Get z value
+ z = (self.table_offset[2] - self.table_half_size[2]) / 2.0
+ # Set leg position
+ leg.set("pos", array_to_string([x, y, -z]))
+ # Set leg size
+ leg.set("size", array_to_string([0.025, z]))
+
+ @property
+ def table_top_abs(self):
+ """
+ Grabs the absolute position of table top
+
+ Returns:
+ np.array: (x,y,z) table position
+ """
+ return string_to_array(self.floor.get("pos")) + self.table_offset
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/table_arena2.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/table_arena2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5813888d0157926b0ee58cb94a8186bce174eb3d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/table_arena2.py
@@ -0,0 +1,98 @@
+
+import numpy as np
+
+from robosuite.models.arenas import Arena
+from robosuite.utils.mjcf_utils import array_to_string, string_to_array, xml_path_completion
+
+
+class TableArena2(Arena):
+ """
+ Workspace that contains an empty table.
+
+ Args:
+ table_full_size (3-tuple): (L,W,H) full dimensions of the table
+ table_friction (3-tuple): (sliding, torsional, rolling) friction parameters of the table
+ table_offset (3-tuple): (x,y,z) offset from center of arena when placing table.
+ Note that the z value sets the upper limit of the table
+ has_legs (bool): whether the table has legs or not
+ xml (str): xml file to load arena
+ """
+
+ def __init__(
+ self,
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(1, 0.005, 0.0001),
+ table_offset=(0, 0, 0.8),
+ has_legs=True,
+ xml="arenas/table_arena2.xml",
+ ):
+ super().__init__(xml_path_completion(xml))
+
+ self.table_full_size = np.array(table_full_size)
+ self.table_half_size = self.table_full_size / 2
+ self.table_friction = table_friction
+ self.table_offset = table_offset
+ self.center_pos = self.bottom_pos + np.array([0, 0, -self.table_half_size[2]]) + self.table_offset
+
+ self.table_body = self.worldbody.find("./body[@name='table']")
+ self.table_collision = self.table_body.find("./geom[@name='table_collision']")
+ self.table_visual = self.table_body.find("./geom[@name='table_visual']")
+ self.table_top = self.table_body.find("./site[@name='table_top']")
+
+ self.has_legs = has_legs
+ self.table_legs_visual = [
+ self.table_body.find("./geom[@name='table_leg1_visual']"),
+ self.table_body.find("./geom[@name='table_leg2_visual']"),
+ self.table_body.find("./geom[@name='table_leg3_visual']"),
+ self.table_body.find("./geom[@name='table_leg4_visual']"),
+ ]
+
+ self.configure_location()
+
+ def configure_location(self):
+ """Configures correct locations for this arena"""
+ self.floor.set("pos", array_to_string(self.bottom_pos))
+
+ self.table_body.set("pos", array_to_string(self.center_pos))
+ self.table_collision.set("size", array_to_string(self.table_half_size))
+ self.table_collision.set("friction", array_to_string(self.table_friction))
+ self.table_visual.set("size", array_to_string(self.table_half_size))
+
+ self.table_top.set("pos", array_to_string(np.array([0, 0, self.table_half_size[2]])))
+
+ # If we're not using legs, set their size to 0
+ if not self.has_legs:
+ for leg in self.table_legs_visual:
+ leg.set("rgba", array_to_string([1, 0, 0, 0]))
+ leg.set("size", array_to_string([0.0001, 0.0001]))
+ else:
+ # Otherwise, set leg locations appropriately
+ delta_x = [0.1, -0.1, -0.1, 0.1]
+ delta_y = [0.1, 0.1, -0.1, -0.1]
+ for leg, dx, dy in zip(self.table_legs_visual, delta_x, delta_y):
+ # If x-length of table is less than a certain length, place leg in the middle between ends
+ # Otherwise we place it near the edge
+ x = 0
+ if self.table_half_size[0] > abs(dx * 2.0):
+ x += np.sign(dx) * self.table_half_size[0] - dx
+ # Repeat the same process for y
+ y = 0
+ if self.table_half_size[1] > abs(dy * 2.0):
+ y += np.sign(dy) * self.table_half_size[1] - dy
+ # Get z value
+ z = (self.table_offset[2] - self.table_half_size[2]) / 2.0
+ # Set leg position
+ leg.set("pos", array_to_string([x, y, -z]))
+ # Set leg size
+ leg.set("size", array_to_string([0.025, z]))
+
+ @property
+ def table_top_abs(self):
+ """
+ Grabs the absolute position of table top
+
+ Returns:
+ np.array: (x,y,z) table position
+ """
+ return string_to_array(self.floor.get("pos")) + self.table_offset
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/arenas/wipe_arena.py b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/wipe_arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1db1cad4eb0501dda1db869eb86871dc74981a3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/arenas/wipe_arena.py
@@ -0,0 +1,186 @@
+import numpy as np
+
+from robosuite.models.arenas import TableArena
+from robosuite.models.objects import CylinderObject
+from robosuite.utils.mjcf_utils import CustomMaterial, find_elements
+
+
+class WipeArena(TableArena):
+ """
+ Workspace that contains an empty table with visual markers on its surface.
+
+ Args:
+ table_full_size (3-tuple): (L,W,H) full dimensions of the table
+ table_friction (3-tuple): (sliding, torsional, rolling) friction parameters of the table
+ table_offset (3-tuple): (x,y,z) offset from center of arena when placing table.
+ Note that the z value sets the upper limit of the table
+ coverage_factor (float): Fraction of table that will be sampled for dirt placement
+ num_markers (int): Number of dirt (peg) particles to generate in a path on the table
+ table_friction_std (float): Standard deviation to sample for the peg friction
+ line_width (float): Diameter of dirt path trace
+        two_clusters (bool): If set, will generate two separate dirt paths, each containing half of the markers
+ """
+
+ def __init__(
+ self,
+ table_full_size=(0.8, 0.8, 0.05),
+ table_friction=(0.01, 0.005, 0.0001),
+ table_offset=(0, 0, 0.8),
+ coverage_factor=0.9,
+ num_markers=10,
+ table_friction_std=0,
+ line_width=0.02,
+ two_clusters=False,
+ ):
+ # Tactile table-specific features
+ self.table_friction_std = table_friction_std
+ self.line_width = line_width
+ self.markers = []
+ self.coverage_factor = coverage_factor
+ self.num_markers = num_markers
+ self.two_clusters = two_clusters
+
+ # Attribute to hold current direction of sampled dirt path
+ self.direction = None
+
+ # run superclass init
+ super().__init__(
+ table_full_size=table_full_size,
+ table_friction=table_friction,
+ table_offset=table_offset,
+ )
+
+ def configure_location(self):
+ """Configures correct locations for this arena"""
+ # Run superclass first
+ super().configure_location()
+
+ # Define start position for drawing the line
+ pos = self.sample_start_pos()
+
+ # Define dirt material for markers
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "1 1",
+ "specular": "0.0",
+ "shininess": "0.0",
+ }
+ dirt = CustomMaterial(
+ texture="Dirt",
+ tex_name="dirt",
+ mat_name="dirt_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ shared=True,
+ )
+
+ # Define line(s) drawn on table
+ for i in range(self.num_markers):
+ # If we're using two clusters, we resample the starting position and direction at the halfway point
+ if self.two_clusters and i == int(np.floor(self.num_markers / 2)):
+ pos = self.sample_start_pos()
+ marker_name = f"contact{i}"
+ marker = CylinderObject(
+ name=marker_name,
+ size=[self.line_width / 2, 0.001],
+ rgba=[1, 1, 1, 1],
+ material=dirt,
+ obj_type="visual",
+ joints=None,
+ )
+ # Manually add this object to the arena xml
+ self.merge_assets(marker)
+ table = find_elements(root=self.worldbody, tags="body", attribs={"name": "table"}, return_first=True)
+ table.append(marker.get_obj())
+
+ # Add this marker to our saved list of all markers
+ self.markers.append(marker)
+
+ # Add to the current dirt path
+ pos = self.sample_path_pos(pos)
+
+ def reset_arena(self, sim):
+ """
+ Reset the visual marker locations in the environment. Requires @sim (MjSim) reference to be passed in so that
+ the Mujoco sim can be directly modified
+
+ Args:
+ sim (MjSim): Simulation instance containing this arena and visual markers
+ """
+ # Sample new initial position and direction for generated marker paths
+ pos = self.sample_start_pos()
+
+ # Loop through all visual markers
+ for i, marker in enumerate(self.markers):
+ # If we're using two clusters, we resample the starting position and direction at the halfway point
+ if self.two_clusters and i == int(np.floor(self.num_markers / 2)):
+ pos = self.sample_start_pos()
+ # Get IDs to the body, geom, and site of each marker
+ body_id = sim.model.body_name2id(marker.root_body)
+ geom_id = sim.model.geom_name2id(marker.visual_geoms[0])
+ site_id = sim.model.site_name2id(marker.sites[0])
+ # Determine new position for this marker
+ position = np.array([pos[0], pos[1], self.table_half_size[2]])
+ # Set the current marker (body) to this new position
+ sim.model.body_pos[body_id] = position
+ # Reset the marker visualization -- setting geom rgba alpha value to 1
+ sim.model.geom_rgba[geom_id][3] = 1
+ # Hide the default visualization site
+ sim.model.site_rgba[site_id][3] = 0
+ # Sample next values in local marker trajectory
+ pos = self.sample_path_pos(pos)
+
+ def sample_start_pos(self):
+ """
+ Helper function to return sampled start position of a new dirt (peg) location
+
+ Returns:
+ np.array: the (x,y) value of the newly sampled dirt starting location
+ """
+ # First define the random direction that we will start at
+ self.direction = np.random.uniform(-np.pi, np.pi)
+
+ return np.array(
+ (
+ np.random.uniform(
+ -self.table_half_size[0] * self.coverage_factor + self.line_width / 2,
+ self.table_half_size[0] * self.coverage_factor - self.line_width / 2,
+ ),
+ np.random.uniform(
+ -self.table_half_size[1] * self.coverage_factor + self.line_width / 2,
+ self.table_half_size[1] * self.coverage_factor - self.line_width / 2,
+ ),
+ )
+ )
+
+ def sample_path_pos(self, pos):
+ """
+ Helper function to add a sampled dirt (peg) position to a pre-existing dirt path, whose most
+ recent dirt position is defined by @pos
+
+ Args:
+ pos (np.array): (x,y) value of most recent dirt position
+
+ Returns:
+ np.array: the (x,y) value of the newly sampled dirt position to add to the current dirt path
+ """
+ # Random chance to alter the current dirt direction
+ if np.random.uniform(0, 1) > 0.7:
+ self.direction += np.random.normal(0, 0.5)
+
+ posnew0 = pos[0] + 0.005 * np.sin(self.direction)
+ posnew1 = pos[1] + 0.005 * np.cos(self.direction)
+
+ # We keep resampling until we get a valid new position that's on the table
+ while (
+ abs(posnew0) >= self.table_half_size[0] * self.coverage_factor - self.line_width / 2
+ or abs(posnew1) >= self.table_half_size[1] * self.coverage_factor - self.line_width / 2
+ ):
+ self.direction += np.random.normal(0, 0.5)
+ posnew0 = pos[0] + 0.005 * np.sin(self.direction)
+ posnew1 = pos[1] + 0.005 * np.cos(self.direction)
+
+ # Return this newly sampled position
+ return np.array((posnew0, posnew1))
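The dirt path above is a bounded random walk in the tabletop plane. The following self-contained sketch (hypothetical function name, NumPy only, no robosuite dependency) reproduces the sampling logic of sample_start_pos and sample_path_pos for quick inspection:

import numpy as np

def sample_dirt_path(half_size=(0.4, 0.4), coverage=0.9, line_width=0.02, n=10, step=0.005):
    # Start at a random point inside the covered area, then take small steps whose
    # heading occasionally drifts; re-draw the heading whenever a step would leave
    # the table, mirroring WipeArena.sample_start_pos / sample_path_pos.
    lim = np.array(half_size) * coverage - line_width / 2
    direction = np.random.uniform(-np.pi, np.pi)
    pos = np.random.uniform(-lim, lim)
    path = [pos]
    for _ in range(n - 1):
        if np.random.uniform(0, 1) > 0.7:
            direction += np.random.normal(0, 0.5)
        new = pos + step * np.array([np.sin(direction), np.cos(direction)])
        while np.any(np.abs(new) >= lim):
            direction += np.random.normal(0, 0.5)
            new = pos + step * np.array([np.sin(direction), np.cos(direction)])
        pos = new
        path.append(pos)
    return np.array(path)

print(sample_dirt_path().round(3))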
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/bins_arena.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/bins_arena.xml
new file mode 100644
index 0000000000000000000000000000000000000000..5e93098acbb30bc267c5e8c6d93f201a0414abde
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/bins_arena.xml
@@ -0,0 +1,74 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/empty_arena.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/empty_arena.xml
new file mode 100644
index 0000000000000000000000000000000000000000..cab8bc8198fb4df90cd9e3a3b9868751857ff4c4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/empty_arena.xml
@@ -0,0 +1,35 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/multi_table_arena.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/multi_table_arena.xml
new file mode 100644
index 0000000000000000000000000000000000000000..9c5cd3a54e7d5b6912fd179ae441296db07a1ddf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/multi_table_arena.xml
@@ -0,0 +1,43 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/pegs_arena.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/pegs_arena.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a3a3211860cb03550617b68d2a43e2fdfcd469e8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/pegs_arena.xml
@@ -0,0 +1,63 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/phantom_table_arena.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/phantom_table_arena.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c99b5829c38732853393f16d40eecf03460e9d35
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/phantom_table_arena.xml
@@ -0,0 +1,60 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/table_arena.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/table_arena.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1c77448f67c23fcac3980a220149ef41efb84c0f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/arenas/table_arena.xml
@@ -0,0 +1,52 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/base.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/base.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b52d438e453ba0a2d808052c8b85f5d0026860bb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/base.xml
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/PEDESTAL.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/PEDESTAL.dae
new file mode 100644
index 0000000000000000000000000000000000000000..1a2f05b303d45623abbcc448f20918d093dd412a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/PEDESTAL.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b0a44b95452421196cccdc1347fbb3f6da6df7a32c5929ba92ae7441b5b1d60
+size 6230755
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/PEDESTAL.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/PEDESTAL.stl
new file mode 100644
index 0000000000000000000000000000000000000000..c40e88cd1aa5e4da35fae1d5225c9eb9750d4dda
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/PEDESTAL.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:405962a9049d58faddfbde642e9ac3fafeead06e1799535eeef4cf01ebd6b25f
+size 3735734
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/pedestal_link_collision.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/pedestal_link_collision.dae
new file mode 100644
index 0000000000000000000000000000000000000000..56bc76595dd29d948d9bbbdfee65d6f2b585921f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/pedestal_link_collision.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b7361cb274df9005294234680b7006aebd4c0067f1515b7a936028cba4c65df
+size 12281
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/pedestal_link_collision.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/pedestal_link_collision.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ec574cfed5b11239176ef327c720f4a8527e6f99
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/base/pedestal_link_collision.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73296b38d0d267f5d2aa0f8626432807e7e7fc3b6aa50263da9d31620d5cba1d
+size 10284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H0.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H0.dae
new file mode 100644
index 0000000000000000000000000000000000000000..6f5b709e908c27cdf3c8765f56413eb6af699807
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H0.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95f8bc9e80eea217afb4e7b271fa71ce52a7054bbe4b20868c1b889e6bc66f65
+size 334255
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..de85f896b6886b97172f1a323887787c4fde93da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fc0a1a1bee8949c2247ccf837883af5e31516bc54dede2d962243691c0c8c68
+size 260834
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..e52f83ddac62ad4769b1f4a0d905821a8922892d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcb09dfdd019b803a0761c367f8acf6f331a0c4984952cc424ecb5a174087422
+size 233396
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..949a5e84ba7069b847bd31e392bed2e907baf633
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/head/H1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96cbc1f7b8c5a7a89927d0a91cf7d3993df6ce43f31e96c6c7baeae0e34d072a
+size 174384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_elbow/E1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_elbow/E1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..da6d38ed1daa5acecd26711a2d7ce8732542dc82
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_elbow/E1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e1b3868d8c43baf307e1cca7e700c8be75c6cd983dace1d76ee2c3b563243e2
+size 796534
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_elbow/E1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_elbow/E1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..bf0788d634e49a84dd3a96e911ad7f8b6412d0c9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_elbow/E1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d82652d80f7e449212fb1528b960e0f6bcae85c3997d4613a3a0e1cda5673db
+size 600284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_forearm/W1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_forearm/W1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..a9e9f195788862b322f3475f5d2bebe6a4fa755a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_forearm/W1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9df525029abbdab8c7ce566d57b6067bcfb804da5a27a6ce43dc59cd2e63ffd4
+size 910723
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_forearm/W1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_forearm/W1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..a83a1f983cd80851fcdb8de3bf16a93a937925db
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_forearm/W1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebc784f78f7353d7b0287682c1bd24c62bf6d3c87e8ee84ddfb350c45dcc9dc8
+size 687784
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_shoulder/S1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_shoulder/S1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..03291076e90badf09c093d129412049a17092dab
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_shoulder/S1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50231b610a1f01a5f22394942e324ea3e4b62c5762a5201b16bab2a2c0ac5bdb
+size 434567
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_shoulder/S1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_shoulder/S1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..5b96e1bdf50785915965ddeeb8e0224aac1602cd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/lower_shoulder/S1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acc43d8ad81f46d080ea3915e8a45a5ee719e9c94d66622c1a5fb7adc35facc6
+size 328784
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link.dae
new file mode 100644
index 0000000000000000000000000000000000000000..259133a4678951d9a2342ca517b12df60b5139ff
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5067ef42e5575227ff0c2235215c9e1bea804dd62dc87c024a714e03a9d7220e
+size 5037837
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..05cb991be1b7d7f607c0578816f57bed7ba25907
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:553cb6a7e4a445d08b3b24f27ebc727cfc0d7602537c13fa7d27fa226dec790b
+size 3463084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link_collision.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link_collision.dae
new file mode 100644
index 0000000000000000000000000000000000000000..4e181e71e730a4f5dc6c520e6a4b0e0026414986
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link_collision.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0303ca3d47d90cc70969e540a234356a551c5e9d62cbccec35d3bd39c11ad50
+size 703067
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link_collision.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link_collision.stl
new file mode 100644
index 0000000000000000000000000000000000000000..369ffe19ab92bce8d47f31ffaddfff0fffd2d485
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/torso/base_link_collision.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2980b03a0e699f0d85dd4ce3074758ee7067de4b81409b5fa1db4e3078c8b58e
+size 458034
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_elbow/E0.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_elbow/E0.dae
new file mode 100644
index 0000000000000000000000000000000000000000..71505fcbc21080075810f9aadcf1f4c3555d4c38
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_elbow/E0.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f26641362fba070757907374cf59cf3052dc4d9b133090552c2f83b6cf46705
+size 877438
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_elbow/E0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_elbow/E0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..24da66fc3989fd717136338504e79e27df3ad009
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_elbow/E0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6685cc363e0ad06afeb49ea4a462f76ffdf65f39308e2f86ff179855a77d34a4
+size 656284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_forearm/W0.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_forearm/W0.dae
new file mode 100644
index 0000000000000000000000000000000000000000..28228dc7609b2eacdd5aa229e1629966f645e839
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_forearm/W0.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f32b0aa2010f8b021bd649166207eb86250d19cee8c1ea25dbf8cf8ee41eecf7
+size 1814264
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_forearm/W0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_forearm/W0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..1d5f2315eb7d537d2fe2b632e9f2527774ee914c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_forearm/W0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:873f7caca17829a2f11f110cbc04aa051c7b221f5b96b04d2db723e7c8adf9f5
+size 1316034
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_shoulder/S0.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_shoulder/S0.dae
new file mode 100644
index 0000000000000000000000000000000000000000..e21cfb985f05fb7066386e57e012795c4c2cb9d4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_shoulder/S0.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97ce864d405db2de44ddd73689fe8e60eec16cc307daf67ee2d6bb886eb9b161
+size 2640134
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_shoulder/S0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_shoulder/S0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..36489e53cc80378e0cd7f350e6bface6f99f9dfa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/upper_shoulder/S0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b66b83af3303764224e2a8ec1b6257082a572c6eb04ddd4ab4a70dbfea1e0017
+size 1917534
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/wrist/W2.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/wrist/W2.dae
new file mode 100644
index 0000000000000000000000000000000000000000..50a054099069dc3e5766f4155f6ad5ca350947b9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/wrist/W2.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a6cc33e009acc6f3f408008839f5a7e208cfb6caa87e8234425279cc726b109
+size 925929
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/wrist/W2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/wrist/W2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..62ae582181ced7e2289fa084bc16ceffad994b52
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/meshes/wrist/W2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17391c5fbee065d6e39feb2b3f77b9e8a0b12a0cd471c4cd4277844a0492ab6d
+size 566584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/urdf/baxter_arm.urdf b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/urdf/baxter_arm.urdf
new file mode 100644
index 0000000000000000000000000000000000000000..b0e2aa6977a6122dba14f55a596277c554c914fa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/baxter_description/urdf/baxter_arm.urdf
@@ -0,0 +1,1546 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+
+
+ transmission_interface/SimpleTransmission
+
+ EffortJointInterface
+
+
+ EffortJointInterface
+ 1
+
+
+
+
+
+
+ /robot
+
+
+
+
+
+ true
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+ 1
+
+
+
+ 30.0
+
+ 0.0 0.21 0.0 0.0 -0.8 -1.570796327
+ 1.3962634
+
+ 800
+ 800
+ R8G8B8
+
+
+ 0.02
+ 300
+
+
+ gaussian
+ 0.0
+ 0.007
+
+
+
+ true
+ 0.0
+ head_camera
+ image
+ camera_info
+ head_camera
+ 0.07
+ 0.0
+ 0.0
+ 0.0
+ 0.0
+ 0.0
+ /cameras
+
+
+
+
+
+ 30.0
+
+ 0.0 0.0 0.0 0.0 -1.570796327 1.570796327
+ 1.3962634
+
+ 800
+ 800
+ R8G8B8
+
+
+ 0.02
+ 300
+
+
+ gaussian
+ 0.0
+ 0.007
+
+
+
+ true
+ 0.0
+ right_hand_camera
+ image
+ camera_info
+ right_hand_camera
+ 0.07
+ 0.0
+ 0.0
+ 0.0
+ 0.0
+ 0.0
+ /cameras
+
+
+
+
+
+ 30.0
+
+ 0.0 0.0 0.0 0.0 -1.570796327 1.570796327
+ 1.3962634
+
+ 800
+ 800
+ R8G8B8
+
+
+ 0.02
+ 300
+
+
+ gaussian
+ 0.0
+ 0.007
+
+
+
+ true
+ 0.0
+ left_hand_camera
+ image
+ camera_info
+ left_hand_camera
+ 0.07
+ 0.0
+ 0.0
+ 0.0
+ 0.0
+ 0.0
+ /cameras
+
+
+
+
+
+
+ 600
+ 1024
+ /robot/xdisplay
+
+
+
+
+
+ 0.0 0.0 0.0 0.0 0.0 0.0
+
+
+
+ 12
+ 1.0
+ -3.14
+ 3.14
+
+
+ 2
+ 1.0
+ -0.001
+ 0
+
+
+
+ 0.05
+ 50.0
+
+
+
+ 0.00
+ true
+ 100.0
+ /robot/sonar/head_sonar/state
+ sonar_ring
+
+ true
+ 100.0
+
+
+
+
+ 0.0 0.0 0.0 0.0 0.0 0.0
+
+
+
+ 1
+ 1.0
+ -0.5
+ 0.5
+
+
+
+ 0.004
+ 0.4
+
+
+
+ 0.005
+ true
+ 100
+ /sim/laserscan/right_hand_range/state
+ right_hand_range
+
+ true
+ 100.0
+
+
+
+
+ 0.0 0.0 0.0 0.0 0.0 0.0
+
+
+
+ 1
+ 1.0
+ -0.5
+ 0.5
+
+
+
+ 0.004
+ 0.4
+
+
+
+ 0.005
+ true
+ 100
+ /sim/laserscan/left_hand_range/state
+ left_hand_range
+
+ true
+ 100.0
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/CMakeLists.txt b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7282c39947f348162715e7e829a4aa5eb0f6e2b9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 2.8.3)
+project(franka_description)
+
+find_package(catkin REQUIRED)
+catkin_package(CATKIN_DEPENDS xacro)
+
+install(DIRECTORY meshes
+ DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+)
+install(DIRECTORY robots
+ DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/mainpage.dox b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/mainpage.dox
new file mode 100644
index 0000000000000000000000000000000000000000..941d0bf97714038b58c39c7dcd816e901e22c023
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/mainpage.dox
@@ -0,0 +1,6 @@
+/**
+ * @mainpage
+ * @htmlinclude "manifest.html"
+ *
+ * Overview page for Franka Emika research robots: https://frankaemika.github.io
+ */
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/finger.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/finger.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ef5e672efbb990561b36fcee2c15b2f61cf42065
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/finger.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d07a740392f3b9b0816f65d64fff9927d3d57c897870fc4b6ff9c56fff3a0c8
+size 1684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/hand.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/hand.stl
new file mode 100644
index 0000000000000000000000000000000000000000..bb315217a60e27343b84a9d4e3a4686762c4fc8d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/hand.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94493e94f30fe940f2c8ca2f155c3bbe67bbff406d3edf5e261670d2f0f6e2ed
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..bbe58384ff30b933eb8758429c4f5cbd970c1b50
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfc6d94330de8ddb005b311bfdba9f3b8e1aa7c256b71592ee7ff32cb9a9a5aa
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..b7e855112619448e2cf63660e07387871de542ac
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e41a39a94108fcf56aacff603fc91ec80541f4c1af17b51a0de5617f5566e6d2
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..6ba548f4137d4ba09e7b0d9299fa631b27af1ea1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:370f7605a0fae3529db169ded50f52f171024aa792d4d773bc84197301f6a039
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link3.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link3.stl
new file mode 100644
index 0000000000000000000000000000000000000000..7115ba0e92d33fd3a2e6e2087df980ae8b9a6730
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link3.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a8d638b9349c6c0eefc4e888636ac4838c4b27170f18a51699321118af709c1
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link4.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link4.stl
new file mode 100644
index 0000000000000000000000000000000000000000..88c6db70bf3c3b68bce08b9bcb5142050b1f9079
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link4.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0180ebb5772ec9840cb049750cffb29a9ddc90311752a16ea34757782ef9e48d
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link5.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link5.stl
new file mode 100644
index 0000000000000000000000000000000000000000..5eaf5c8ec2155135ab9297d51e3dae6e5e280675
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link5.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd17e688c7870e722283525879643d53a74c0024d328b0e14b034b54c8b6c31a
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link6.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link6.stl
new file mode 100644
index 0000000000000000000000000000000000000000..828ad3bd384b22ef734d8add0e50d6ae449dce9c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link6.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20b768e99a0e0440b5754dcca108016434e57937cc356acd9c352ccd3cb27f77
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link7.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link7.stl
new file mode 100644
index 0000000000000000000000000000000000000000..2047756ec662f051af90fe61266998bf16e655fe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/collision/link7.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92ac6afcf7574c034d3170d8a68e95ac9048ab9d0dd5bbd8311b86e551b9ab1c
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/finger.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/finger.dae
new file mode 100644
index 0000000000000000000000000000000000000000..06f2732f3d24135ff2c06b0217fc328925cb40b5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/finger.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d5471e9cdf76f85493df6fe6c76d29810edd21577c204009160496d5b77fbb
+size 51123
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/hand.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/hand.dae
new file mode 100644
index 0000000000000000000000000000000000000000..073dd433d2574db0060d31b8f03f26a4f5851bee
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/hand.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9602bbad114cfefd4bbb0cd0a57f3021bce9efbaaa1604a5e72bfb76926bb019
+size 548949
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link0.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link0.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2ea2fbee592e033a1cdf431400ad8c2ac4091248
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link0.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9ceaa66bb3a734e3a32f2f737ae57a29e922f4a962ed77b9bb8d8e25cd33159
+size 1590896
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2030d60bcfb6e3f00c6287fbe8cad91be27118f0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c8b7b7c1217d620a811fc0ee52d1d1b0e1470de955e7453872aac3f15cf7c5e
+size 978415
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link2.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link2.dae
new file mode 100644
index 0000000000000000000000000000000000000000..64981bd82b803b79f46005dd56885b4ae01e9d87
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link2.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c44d0364f0030007e427106a4e842d835ca43902716cc46ee4f3342dab189e12
+size 998486
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link3.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link3.dae
new file mode 100644
index 0000000000000000000000000000000000000000..23d6124df5e3d5696241441fbdf7dcfe53dbe150
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link3.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dab39a126153fb82f3650cca6de63a8e978f851aa5020a8e91b3d9d548dbba3d
+size 1099651
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link4.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link4.dae
new file mode 100644
index 0000000000000000000000000000000000000000..0ce1680db10d42992cb781fa23e3b5db43dea3ff
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link4.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e03d680e3a4a4555d673bcb8cb466e479f5cb069a5fc8a0b0f99c089c50fd63
+size 1145491
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link5.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link5.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b6911ff709357cc25b27355fc36873d1d30f9cc1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link5.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0be76681192578a14d6ace89527e4ee418f7395e825e285125e05fd998d24e3e
+size 1438169
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link6.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link6.dae
new file mode 100644
index 0000000000000000000000000000000000000000..adac012b16351aecef432a28bd593edc0872a9ae
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link6.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed9c57432b079d55b9954775f2ddfe34e8b904f683949b8eb6314238f8afa46e
+size 1727767
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link7.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link7.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b6d289bc5b7d51793fd2bf805695356eed8ae3ac
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/meshes/visual/link7.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71be614f734bd27b2d7dec3e8bb022251cbbfce38b0a12dbfc1b88bc0513822a
+size 935952
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/package.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/package.xml
new file mode 100644
index 0000000000000000000000000000000000000000..6db900392f56e7b1529f93dd5304874fbe38a4f9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/package.xml
@@ -0,0 +1,17 @@
+
+
+ franka_description
+ 0.7.0
+ franka_description contains URDF files and meshes of Franka Emika robots
+ Franka Emika GmbH
+ Apache 2.0
+
+ http://wiki.ros.org/franka_description
+ https://github.com/frankaemika/franka_ros
+ https://github.com/frankaemika/franka_ros/issues
+ Franka Emika GmbH
+
+ catkin
+
+ xacro
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/rosdoc.yaml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/rosdoc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..96ee597ef5cacd0f223f676c738f396f0810b78c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/rosdoc.yaml
@@ -0,0 +1,2 @@
+- builder: doxygen
+ javadoc_autobrief: YES
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.urdf b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.urdf
new file mode 100644
index 0000000000000000000000000000000000000000..59b9548ac779bfb0369344c50d2b5e2f1900c1c6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.urdf
@@ -0,0 +1,76 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.urdf.xacro b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.urdf.xacro
new file mode 100644
index 0000000000000000000000000000000000000000..643fc608f05f2bd79f0212e01e1a01086d02bd57
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.urdf.xacro
@@ -0,0 +1,5 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.xacro b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.xacro
new file mode 100644
index 0000000000000000000000000000000000000000..3f3a209faff8476ee4521a812c5d28314fc8de51
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/hand.xacro
@@ -0,0 +1,80 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.urdf b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.urdf
new file mode 100644
index 0000000000000000000000000000000000000000..2e2b6692b225ba21d0f01af729cd1a0088514b13
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.urdf
@@ -0,0 +1,213 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.urdf.xacro b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.urdf.xacro
new file mode 100644
index 0000000000000000000000000000000000000000..ffd0bf1352da0f059e827233f399bade484d46d4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.urdf.xacro
@@ -0,0 +1,5 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.xacro b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.xacro
new file mode 100644
index 0000000000000000000000000000000000000000..452e56804b247f347d38c844b2b4783462c6cf0d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm.xacro
@@ -0,0 +1,217 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm_hand.urdf b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm_hand.urdf
new file mode 100644
index 0000000000000000000000000000000000000000..824e988509d7216dfcb2e2d9fe006fb90e456105
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm_hand.urdf
@@ -0,0 +1,286 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm_hand.urdf.xacro b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm_hand.urdf.xacro
new file mode 100644
index 0000000000000000000000000000000000000000..c2415c2c6ea7827364aeb7c4f2e5e1918c22d51e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/panda_description/urdf/panda_arm_hand.urdf.xacro
@@ -0,0 +1,7 @@
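The panda_description URDF and xacro bodies above are not reproduced in this extract; only the file paths and line counts survive. Since they are added under robosuite's bullet_data tree, a reasonable (but unverified) assumption is that they are consumed by a PyBullet-style loader. A hedged sketch of loading the combined arm-plus-hand model:

```python
# Illustrative only: load the added Panda URDF with PyBullet and list its joints.
import pybullet as p

# Path taken from this diff; adjust to wherever the repo is checked out.
ASSET_DIR = "phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data"

p.connect(p.DIRECT)  # headless physics server
# Depending on how the URDF references its meshes, an extra search path
# (p.setAdditionalSearchPath) may be needed for the .dae/.stl files above.
robot = p.loadURDF(f"{ASSET_DIR}/panda_description/urdf/panda_arm_hand.urdf",
                   useFixedBase=True)
for i in range(p.getNumJoints(robot)):
    info = p.getJointInfo(robot, i)
    print(i, info[1].decode())  # joint index and joint name
p.disconnect()
```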
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/CMakeLists.txt b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a0a3a886382af73c4d3c5a8c53ec87bb21ed9fe2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 2.8.3)
+
+project(sawyer_description)
+
+find_package(catkin REQUIRED)
+
+catkin_package()
+
+foreach(dir config meshes params urdf)
+ install(DIRECTORY ${dir}/
+ DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/${dir})
+endforeach(dir)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/config/sawyer.rviz b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/config/sawyer.rviz
new file mode 100644
index 0000000000000000000000000000000000000000..edb9b7dfcd14be2ba6a4efd0f8c80141f8ef92df
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/config/sawyer.rviz
@@ -0,0 +1,216 @@
+Panels:
+ - Class: rviz/Displays
+ Help Height: 78
+ Name: Displays
+ Property Tree Widget:
+ Expanded:
+ - /Global Options1
+ - /Status1
+ Splitter Ratio: 0.5
+ Tree Height: 728
+ - Class: rviz/Selection
+ Name: Selection
+ - Class: rviz/Tool Properties
+ Expanded:
+ - /2D Pose Estimate1
+ - /2D Nav Goal1
+ - /Publish Point1
+ Name: Tool Properties
+ Splitter Ratio: 0.588679016
+ - Class: rviz/Views
+ Expanded:
+ - /Current View1
+ Name: Views
+ Splitter Ratio: 0.5
+ - Class: rviz/Time
+ Experimental: false
+ Name: Time
+ SyncMode: 0
+ SyncSource: ""
+Visualization Manager:
+ Class: ""
+ Displays:
+ - Alpha: 0.5
+ Cell Size: 1
+ Class: rviz/Grid
+ Color: 160; 160; 164
+ Enabled: true
+ Line Style:
+ Line Width: 0.0299999993
+ Value: Lines
+ Name: Grid
+ Normal Cell Count: 0
+ Offset:
+ X: 0
+ Y: 0
+ Z: 0
+ Plane: XY
+ Plane Cell Count: 10
+ Reference Frame:
+ Value: true
+ - Alpha: 1
+ Class: rviz/RobotModel
+ Collision Enabled: false
+ Enabled: true
+ Links:
+ All Links Enabled: true
+ Expand Joint Details: false
+ Expand Link Details: false
+ Expand Tree: false
+ Link Tree Style: Links in Alphabetic Order
+ base:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ head:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ head_camera:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ pedestal:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_arm_base_link:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_arm_itb:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ right_hand:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_hand_camera:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ right_l0:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_l1:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_l2:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_l3:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_l4:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_l5:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_l6:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ right_torso_itb:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ right_wrist:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ screen:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ torso:
+ Alpha: 1
+ Show Axes: false
+ Show Trail: false
+ Value: true
+ Name: RobotModel
+ Robot Description: robot_description
+ TF Prefix: ""
+ Update Interval: 0
+ Value: true
+ Visual Enabled: true
+ Enabled: true
+ Global Options:
+ Background Color: 48; 48; 48
+ Fixed Frame: base
+ Frame Rate: 30
+ Name: root
+ Tools:
+ - Class: rviz/Interact
+ Hide Inactive Objects: true
+ - Class: rviz/MoveCamera
+ - Class: rviz/Select
+ - Class: rviz/FocusCamera
+ - Class: rviz/Measure
+ - Class: rviz/SetInitialPose
+ Topic: /initialpose
+ - Class: rviz/SetGoal
+ Topic: /move_base_simple/goal
+ - Class: rviz/PublishPoint
+ Single click: true
+ Topic: /clicked_point
+ Value: true
+ Views:
+ Current:
+ Class: rviz/Orbit
+ Distance: 2.27867007
+ Enable Stereo Rendering:
+ Stereo Eye Separation: 0.0599999987
+ Stereo Focal Distance: 1
+ Swap Stereo Eyes: false
+ Value: false
+ Focal Point:
+ X: 0
+ Y: 0
+ Z: 0
+ Focal Shape Fixed Size: true
+ Focal Shape Size: 0.0500000007
+ Name: Current View
+ Near Clip Distance: 0.00999999978
+ Pitch: 0.240398526
+ Target Frame:
+ Value: Orbit (rviz)
+ Yaw: 5.87858343
+ Saved: ~
+Window Geometry:
+ Displays:
+ collapsed: false
+ Height: 1016
+ Hide Left Dock: false
+ Hide Right Dock: false
+ QMainWindow State: 000000ff00000000fd00000004000000000000015600000360fc0200000008fb0000001200530065006c0065006300740069006f006e00000001e10000009b0000005c00fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000a3fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000003a00000360000000c600fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261000000010000010f00000360fc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003a000003600000009e00fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e10000019700000003000006b50000003efc0100000002fb0000000800540069006d00650100000000000006b50000024400fffffffb0000000800540069006d00650100000000000004500000000000000000000005590000036000000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000
+ Selection:
+ collapsed: false
+ Time:
+ collapsed: false
+ Tool Properties:
+ collapsed: false
+ Views:
+ collapsed: false
+ Width: 1717
+ X: 203
+ Y: 35
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/launch/test_sawyer_description.launch.test b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/launch/test_sawyer_description.launch.test
new file mode 100644
index 0000000000000000000000000000000000000000..37e9393501e5538492a09d45448ef14d7e96ce9d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/launch/test_sawyer_description.launch.test
@@ -0,0 +1,19 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/base.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/base.dae
new file mode 100644
index 0000000000000000000000000000000000000000..79993c4d499be9416a86e6f5abdcb35845fb7ef4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/base.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f879eb086ce265c73fa16b6c39b9344be2545e293fabbc4cd37ae405f991101
+size 1000898
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/base.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/base.stl
new file mode 100644
index 0000000000000000000000000000000000000000..83382c3684a5416298a7889732f86b57a5635efd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/base.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97b3997a278b7d7be42142f49f435a4a0d7856736b943bfb3590dc43210055f6
+size 264934
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l0.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l0.dae
new file mode 100644
index 0000000000000000000000000000000000000000..8f39b917258adab2e893e8ea9fef2abb8e0f1bb6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l0.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d5ef27f6f53e5ab78ee0791d28f47099753ab5966132f67339e3514346eefd8
+size 3760739
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..263bfce5dbb757036183d9c849615e4508891ba9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f4a7f56bcf4cbfbb72414acd61956ccb8db88ca5ec4074ff626a44cf41a18c1
+size 675584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..253e63fd21bb6796d2357984078aca396175869b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f500967d2a91c6321a453dc3a5949013729354d04c8de4a8c20116e61918408e
+size 473148
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..964ea6cdf273aa59cb968bec8b71889df8bc5727
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6d8ba089c3da8a4a40e176a13928e3a39cfffa2fdc83311ac2f6b59035ab6d0
+size 511884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l2.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l2.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b83494d7fb80655f2e0d5292b1eea479e1b74ca2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l2.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a56dc6354f42d850b35d246266de67c0fb22eb17576e19712340f5909116aed
+size 654737
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..3ef2de15c32d518d145e0d76a78894fd17d563dd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a0d2ccf5668f737409d0e7fc2578b9e24660298d09787852494a7adae8c58b1
+size 133734
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l3.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l3.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2a6630f83b9739810d4bc5d6506ceefaa28a5587
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l3.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84f67d46fcc4acafd8fd7bd620ef6354b884d3f96eb4d09e792b40ce9ead0669
+size 618017
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l3.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l3.stl
new file mode 100644
index 0000000000000000000000000000000000000000..f0d4d108e88ba1629d943451db1d68916e810bf6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l3.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9faa1f689135fcda53c50b18f9a5c55bc718bb30790395a20821e96b87a174d0
+size 160034
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l4.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l4.dae
new file mode 100644
index 0000000000000000000000000000000000000000..08e9bafdf91240f1c1213757797b5d33b63e0a91
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l4.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b1742337c74de3ec0841eeb651291a09dadd6cbe84e2b1f26a1c4a99272a1c0
+size 4922491
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l4.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l4.stl
new file mode 100644
index 0000000000000000000000000000000000000000..a7d307ae67fac5efc496faebd8050ebbf73dacaa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l4.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9f7032958ae8feef741a4a073f59f3cf9f8f491d93505d0bc9047269cb2e1f2
+size 208284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l5.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l5.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2e68e1492190a8672d7cee28cc916c5fa9a60cc2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l5.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1ab84f9f671c9842a5673c4236af79c9809ea19469d447bd156fdd8283027d8
+size 2121615
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l5.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l5.stl
new file mode 100644
index 0000000000000000000000000000000000000000..1ca0d1a4164f61bde4838fef7f86312150c5bc45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l5.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5078849cf3e1ae7790a6014e1ecb51eda84b44d2d91206783bb0b1fd1740e9ff
+size 176534
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l6.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l6.dae
new file mode 100644
index 0000000000000000000000000000000000000000..0a170130340091219653ef8fc94307b7798669ac
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l6.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:096155b75cd14bdb9e921c990f358844c9212178f32034a3fa92ef299039b7dc
+size 1990228
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l6.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l6.stl
new file mode 100644
index 0000000000000000000000000000000000000000..9c369213a5624835906d3f9abc7fa32face88e95
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/meshes/l6.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6f4d3eb3e3b89bd364b7d92cd5fa229582295bb7ed6c37f494f370897151acd
+size 261534
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/package.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/package.xml
new file mode 100644
index 0000000000000000000000000000000000000000..22d3d336d400d4091af0ac8e90a1261903b39153
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/package.xml
@@ -0,0 +1,29 @@
+
+
+ sawyer_description
+ 5.0.4
+
+ Description of Sawyer Robot from Rethink Robotics.
+ This package contains the URDF and meshes describing Sawyer.
+
+
+
+ Rethink Robotics Inc.
+
+ BSD
+ http://sdk.rethinkrobotics.com/intera/
+
+ https://github.com/RethinkRobotics/sawyer_robot
+
+
+ https://github.com/RethinkRobotics/sawyer_robot/issues
+
+ Rethink Robotics Inc.
+ catkin
+
+ robot_state_publisher
+ joint_state_publisher
+ tf2_ros
+ rviz
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/params/named_poses.yaml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/params/named_poses.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0d8d30349c8a2913c247ba921be54111f516bf6c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/params/named_poses.yaml
@@ -0,0 +1,12 @@
+# ------------------------------ Sawyer ------------------------------
+named_poses:
+ right:
+ joint_names: ['right_j0', 'right_j1', 'right_j2', 'right_j3', 'right_j4', 'right_j5', 'right_j6']
+ poses:
+ neutral: [0.00, -1.18, 0.00, 2.18, 0.00, 0.57, 3.3161]
+ shipping: [0.00, -1.57, 0.00, 2.79, 0.00, -2.79, 3.3161]
+ head:
+ joint_names: ['head_pan']
+ poses:
+ neutral: [0.00]
+ shipping: [-3.14]
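named_poses.yaml groups each limb's joint names with named joint-angle vectors (neutral and shipping). A small sketch that turns the right-arm neutral pose into a joint-name-to-angle mapping (the file path is assumed from this diff):

```python
# Sketch: read named_poses.yaml and build a joint-name -> angle mapping (path assumed).
import yaml

with open("sawyer_description/params/named_poses.yaml") as f:
    cfg = yaml.safe_load(f)["named_poses"]

right = cfg["right"]
neutral = dict(zip(right["joint_names"], right["poses"]["neutral"]))
print(neutral)  # {'right_j0': 0.0, 'right_j1': -1.18, ..., 'right_j6': 3.3161}
```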
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/urdf/sawyer_arm.urdf b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/urdf/sawyer_arm.urdf
new file mode 100644
index 0000000000000000000000000000000000000000..7d483774a20c7462672ff8eaa70b338e52a834e4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/bullet_data/sawyer_description/urdf/sawyer_arm.urdf
@@ -0,0 +1,234 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/handover/panda_panda/demo.hdf5 b/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/handover/panda_panda/demo.hdf5
new file mode 100644
index 0000000000000000000000000000000000000000..643380a388aeb3c084b1db807142e51baa627620
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/handover/panda_panda/demo.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2e4b04b8a46dea44218a733d022c14a8ceda723b338d274795d7274f0eeed24
+size 5045092
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/lift/demo.hdf5 b/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/lift/demo.hdf5
new file mode 100644
index 0000000000000000000000000000000000000000..d1777467a5c63a7bd924ef0d399e06e7f268eb2f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/lift/demo.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08c060c1dcd30582a4f84cf1b9c2daa009c9c75262e78235eca1bb58d0f84e61
+size 559784
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/wipe/panda/demo.hdf5 b/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/wipe/panda/demo.hdf5
new file mode 100644
index 0000000000000000000000000000000000000000..c8ade1a68fba3d17e50597eaf721c118fce8c11a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/wipe/panda/demo.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06c164a9fc609d6374b5ecc30e14e3f331e7f18e2ea6d7cb5888b14f16630a5f
+size 48112
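The demonstration assets above (handover, lift, wipe) are HDF5 archives, again stored as LFS pointers. Once the real blobs are pulled, h5py can be used to inspect them; the sketch below only walks the file and prints what it finds, since the internal layout is not shown in this diff:

```python
# Sketch: inspect a pulled robosuite demo.hdf5 without assuming its internal layout.
import h5py

# Path taken from this diff; adjust to the checkout location.
with h5py.File("robosuite/models/assets/demonstrations/lift/demo.hdf5", "r") as f:
    print("top-level keys:", list(f.keys()))
    print("root attributes:", dict(f.attrs))
    f.visit(lambda name: print(" ", name))  # recursively print every group/dataset name
```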
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/wipe/panda/models/model_1.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/wipe/panda/models/model_1.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a5b8948c45f648069497f96e28cc3699d99f24d2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/demonstrations/wipe/panda/models/model_1.xml
@@ -0,0 +1,247 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/jaco_three_finger_gripper.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/jaco_three_finger_gripper.xml
new file mode 100644
index 0000000000000000000000000000000000000000..da48f24ae15f9f9f4716a8942a79619dd58307db
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/jaco_three_finger_gripper.xml
@@ -0,0 +1,127 @@
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.dae
new file mode 100644
index 0000000000000000000000000000000000000000..0d8e1bcf055e86f815d5e6bf598ec4eca0e61f4d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73f6662e0e9c47e43b9f3018069cf6285f40e163d3555a6928607f058ed7cffb
+size 141699
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..cfab8c125d4e18b2404e2af4cceea7a1d92e1782
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75491d30baa89763ac7b5cac43ea69aa42a863527805b6166709371c3a7dbfa7
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bb9ee71efcac9cec598405d44a94c6dc5e3f72cf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:898c8ca4f5790fe956bde643836338346ae740b7e7f27a33ea0ea81fe2c32027
+size 170213
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.stl
new file mode 100644
index 0000000000000000000000000000000000000000..e511f032175daecd562f67950a904f67ebde33d9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_distal.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27ce521eedbd5e70b6bfa364a29b357d2b03f15ebf1ffeb08c46bf28ccf7f6ca
+size 97184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.dae
new file mode 100644
index 0000000000000000000000000000000000000000..7a0c2598d4214ef6e10bf4c4c775b5dbc09488e0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce16acaa3571b4023c131c7310744fc28894c9703c9833a3d274bb38a704fa33
+size 200973
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..a33160bac0a49d454b675786838b316a2cbf7c2b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48583af9d2f3a7234a334faae1c997bc77b0ab74b631d6bbe972be92f94a1597
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.obj
new file mode 100644
index 0000000000000000000000000000000000000000..80fd9f750bfe1d3bf14d206a0833ae58ef306c08
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af3119fe7f58591b81ce9fb70c98228fe7ed4b5d33111b36e1a3724f220fa49b
+size 235345
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.stl
new file mode 100644
index 0000000000000000000000000000000000000000..363b6403b838407c0e9ecef365f34dbcb40e6ef6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/finger_proximal.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e4c3b68fe806f53fbae43503176981be61aca54a396d1aa9f3e10f87bc4e4f5
+size 135584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b904643656d3c019560c12d8458ec9de710672fc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:018d65704e40f198c2af21ce69de59b955f13c238875f0df229ac7f8436bb003
+size 2286771
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..a3f5d677bb381e83691b67687eb2a4ea527d06d9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04e4454818bf7b38eaa107b96761eac370f7e7eb506adfa59bf246b02f4c1ef6
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4fb49331561c303f59fd9e70fe456c98b9c83a88
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b12a144371e024b3b16acdb473a2a68eaf939dbed963895be80cb0062af29bc9
+size 2834748
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ffbab3ecdcc1151d9003a80eb6fb53aa54161038
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/hand_3finger.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e1a0ecb85ee2c8213ec197e75e53c30dbfcca06ca114bef1a1a80c798349a22
+size 1395484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.dae
new file mode 100644
index 0000000000000000000000000000000000000000..84a5951ee6c71476b32936883981c593580974d0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf4d3b83ed06116b66c6d33a7e2d5e2d2643b7a822147c44bfbba80b5b27a234
+size 35177
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..7fcc1144e135f419a54d66bf76a29b2b659ecbd0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a727be7aca01ef35192e60aa4db05200a956579a4f7b26d9b232b5def096ec2f
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c5693cf2b2b7f57d3beb88048469bcec72be5bfa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2694fd3dbf1c3341fbc447b298457d93a80a6697aeb3bc6849f7238608dcba93
+size 35173
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.stl
new file mode 100644
index 0000000000000000000000000000000000000000..6ae5c8c3335e4d54343b30fe8fc9959d20fb1419
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/jaco_three_finger_gripper/ring_small.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d60973e9d9ff8c6b05d49a120bab2fc8df42e7270d72e369808e9c1678db5eb
+size 22684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ef5e672efbb990561b36fcee2c15b2f61cf42065
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d07a740392f3b9b0816f65d64fff9927d3d57c897870fc4b6ff9c56fff3a0c8
+size 1684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_longer.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_longer.stl
new file mode 100644
index 0000000000000000000000000000000000000000..9082187ed0ee299915e8fa4cc26867d2ea125df6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_longer.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8857e33ee59c3a395c3036a11e4f1731c88c203c821bb03ca97737a2d04cba27
+size 2884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..fb26096106f740cb0bddbb967a50ee0c5c8a3ab5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caefea762f2d18ca9412cf6c2e64e5007ad03571181d790a8e0c828b1b2035cf
+size 51239
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..91e73caca094b6a58d95209ccf1b6f045a36a8be
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54213c34eb8bc8db0d52a3d38c28954dcb7e3d36395d5f78a77e6f5efa5f1d69
+size 432
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c0490203d61752c1bee87f584ff289734d8b7a95
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:105cce7bacc069fd288a55b214fdfe37287d52f26d844940b56f022ad5d4839e
+size 65235
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..5b8512a4d5d2de4c7019ca5fc664dfbf309c6f61
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/finger_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e49613148a74f1ba9b5793813078e5becf15833e0296073dc1c523508be35ae4
+size 31284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand.stl
new file mode 100644
index 0000000000000000000000000000000000000000..bb315217a60e27343b84a9d4e3a4686762c4fc8d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94493e94f30fe940f2c8ca2f155c3bbe67bbff406d3edf5e261670d2f0f6e2ed
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..4d3078316aadb6d8ef825e214d4d0214855c9e88
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe5d445d509e44a9bd107ff78f0b03c49752c98e5e0a8ebafddf2b6cf5a8b380
+size 549239
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..94839a4e83a0b567a1833d23c2695282b5c14d6f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:732ba90affaf8debb3a4f3bddbe2bc5438137ed5cf325ce1f5829d0bab5e7b82
+size 1038
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8ce0a4209c5ce0620b479f3f7d72e8f459010600
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8fa0939d6143d690838f611716901bbf5220d12068a5a60d76251bd3d24862f
+size 737011
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..440763e579e0bc61fb9986de9a751e09f5269e40
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/panda_gripper/hand_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdeb1a924b8d0f1f997f47d6d8af102c7290a1d5144145efdc39501dc13f0fa1
+size 353984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.dae
new file mode 100644
index 0000000000000000000000000000000000000000..31a7fbc7e1ebbea72c8fc0ad7c5db17cff36b1a6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ea82518c235326eec78addddb4d21cfb629c0b28299b7382a7f361df4f7d98f
+size 2512513
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..b94e8f669d65d43bbcff3693856082fa67812c62
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa1d09f88548c514fdb275dd03d09bc0d961875aeb5e016f57740825b1d93c8f
+size 233
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.obj
new file mode 100644
index 0000000000000000000000000000000000000000..cee3c58a18040882fb7831efd6d239b767e763ea
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38b7adecc394bc2d829dc7de308126a93d53a2de0129318d2cb01fcac3ffa10d
+size 3149293
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ef9368c24b01dc3fa8f1272254133ab5fb4fc119
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/connector_plate.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a23b14473b4a6c38a7702982b77b3f12a15859e4eb96eb0b4b343e41d08f679f
+size 1758684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.dae
new file mode 100644
index 0000000000000000000000000000000000000000..4cc25cd14b357177bc85c3504711caa2eaa77d3e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d58734eef052462c3832d56bb1ea82621bfd23b3b78ec76b45e4ebcf3453b416
+size 984153
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..43a6db963b31a557c06a46fa706335907de7e316
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50421e5c4910c71f56d578f5f46d5c025a3c607c505644cf72ec21781a05ef55
+size 419
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4a48227847c776763fe9af753e982b57dd0c2c39
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad10083f865da3aabb7d578f62c8f41414a6bca7e6ed198a9cddfff60da132a5
+size 1213713
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.stl
new file mode 100644
index 0000000000000000000000000000000000000000..af9970c234745044aeaaaa7baaf1ffbf0cc1c933
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/electric_gripper_base.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3112ecc88a650b57a3f20dfcafb28d41cc88639b6e8ac85f3cb8739605a2da39
+size 618984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.dae
new file mode 100644
index 0000000000000000000000000000000000000000..ac663e964f00bbf1ebc9ee78859625ca563230d5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b6517e2e44b503984beee9fb83c7a1e5228f532c943757a00959d7748bd44a7
+size 235303
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..c5ad9787119bae0d504525e5d7787939231632fa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2eae3b81e9059410c438231b86e57f2b78692ff46b4cd665c9ca90cab0f27d88
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c4edffa28e35811c0a803ccec6ecc067580bd6cd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4c772ec1757804654ee972496cc6df4a501fbf4f106aabc916cb43b29c11056
+size 287325
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.stl
new file mode 100644
index 0000000000000000000000000000000000000000..91bed464dbc7d62204f25fbc012d8e9b1192c4d2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/half_round_tip.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a38e5377a4806d6d997efa2afae78d37aee10353e7c59dd991b2f41ade85e6b
+size 148234
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.dae
new file mode 100644
index 0000000000000000000000000000000000000000..58f22e8b17372dd508867638c61b45b2257ed979
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9dd36ca75cd19bf67c8864c2c1a13c802be96c6f9f370f51764cf56c9f732d4b
+size 478054
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..ea8491ab829d563156e2f50e578a016a5f827c34
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9bf5fa3bfbe71eda7979c64f1280ee6b03ab861e27d863d4bc8ba53e0743fce
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4150d9d4fd2a595b7208ec263140321ecaaccddb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8eab9e9f9565e7e61385ccff06fb11068b20330c96c4cc759347aa3d2388c713
+size 615399
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ff20d7b9d30c61b52d7a04b86598370e26d78c98
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/rethink_gripper/standard_narrow.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb611ec499f718b82d894ae51d0babe6500c3fdbadd125143ead59bb7795a55
+size 298434
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger.stl
new file mode 100644
index 0000000000000000000000000000000000000000..c15965c0cfd22fff5f937cc8a0d09f3d4afd2e33
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:043901d9c22b38d09b30d11326108ab3ef1445b4bd655ef313f94199f25f57ed
+size 7284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..893ecc12ce943c24747573eac5d14a52e2d03457
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b83c49589f1a1ed2981aa0e6adcba03216536f633b2037d9006bf314b9e57e4
+size 51616
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..dc22f558729864de277fd1acf24bee78923e9507
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_finger_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be21c5e18c901d4747cbc8fa1a2af15dad7173ff701482a8517e33278432bb21
+size 33984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle.stl
new file mode 100644
index 0000000000000000000000000000000000000000..1375e4e8f379978449da10ccbb14dd6555074b67
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d341d986aa8aca7262565c039c04fb6b4c0f2f5c93443519eb7ca9bfc67ba17c
+size 5484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8be2f8503d756381333de0f139828644e763f1da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bdca583239b4da0024c1e8d7dec8f2cbd191b228375028832d41f6bab962c0a2
+size 68467
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..33365cf8627bf74af9ffd6d09241ec770f5cc2ce
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_inner_knuckle_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d61e9c5304d8015333856e4a26d99e32b695b6ada993253986b1afe8e396ab11
+size 43484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger.stl
new file mode 100644
index 0000000000000000000000000000000000000000..4f19c840371f5e5ece42bde7876de650e9ac15cc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd7da1b31b73f1aa1d61b26a582cea10f16f17396d54b8937890fa51547d26b0
+size 11684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d77ecc2e2f7c68e2e81899647724c164cff86174
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7fe9d55a54b4604e6e4781eeccc5b69bff431f37abad051024e5cef75302768
+size 121033
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ff31816334d2214b68d5afbee8ed6ca2863d41a4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_finger_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:666a92ee075f6f320ffb13b39995a5b374657cee90ded4c52c23ede20a812f34
+size 76084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle.stl
new file mode 100644
index 0000000000000000000000000000000000000000..c2818a6266086849687d93c10833d19be237e36e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37c1779f71fb5504a5898048a36f9f4bfce0f5e7039ff1dce53808b95c229777
+size 9784
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5f6470195ffb186581ef2451f97137e475ff0e9b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c6cfb105fb57001f377d986b8870243faa5a02b6d73fa81db32c590cf2f2c73
+size 125864
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..d511e7320d717cc6a271ad5f94f05f700fc996f1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_140_outer_knuckle_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f042edc44ede9773e7cad00f7d7354d0b7c1c6a6353fe897b2ca2e6ac71107fc
+size 78384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..3ef56dcf17eb02fa9870e22f411d2543de2f49ff
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:111e37f13a664989dd54226f80f521b32ea0b71c975282a16696b14be7cc9249
+size 86384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..65e501d811be48c67556ef9637dd5f0b2f3bcd54
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f914125f62184d183c4fd30d04a7ea79222d622d584d5d39db0354ef19c982c3
+size 1864825
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..6c39b86af6cd7cac704a703438ff5993e65c065d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_base_link_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74a62de75ae10cf77c60f2c49749b5d11f4c265f8624bbe7697a941fa86f6b3b
+size 1054984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling.stl
new file mode 100644
index 0000000000000000000000000000000000000000..8958e11500cfb541905c4a5179d71947f6895379
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ca9ffc28ed04193854b005358599dd9c3dc6fa92c8403e661fda94732d9ac25
+size 21184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..05e44f123c43c3bc469852c12f03078cc9f49a8c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e5f1f30e322107726a1f742120fc953bc2086e8f8eb80d1eb09b249dff63b5f
+size 273992
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..e2289a1e75b5ee9e21d22de667fdb5805011d08b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_140_gripper/robotiq_arg2f_coupling_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4281e83002a25c20dc07c68b8d77da30a13e9a8401f157f6848ed8287d7cce44
+size 160684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_adapter_plate.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_adapter_plate.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e4629051e91c163b78039dadebdc2bbb5a27d0dc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_adapter_plate.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a54704e6690f1c0e020003b83631e6e7aa3de74c7174c2df1edff5fc35d1713e
+size 15002
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_base.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_base.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9019e6a324a847da99faeb374757927dd3917a1a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_base.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fa5ba3211cf22d21e34723579f536b0bf64d3ac58ad1293a960e19a746d402f
+size 755744
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_0_L.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_0_L.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e863ebb9fc4ef6de615786fc5aed112cad60be00
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_0_L.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c5bf8cfb5f36fa9122cccab8e1ef17089eb9816ed1706c3bde65ea29eca3e8f
+size 73327
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_0_R.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_0_R.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4a5de0f295eb1e67a2029d20d95c58cf10582928
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_0_R.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07ea7c7e2a38369cfa927f1121169cbd271f2c4c7a7fffca3b87420ee8b15af5
+size 74351
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_1_L.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_1_L.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6f648790f96039cda0cfe305d36bb537faaa6ba4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_1_L.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17a5979d5569da3141fa20d5a18856beefae656f2fd79f126dcdaf7e187de339
+size 112247
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_1_R.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_1_R.obj
new file mode 100644
index 0000000000000000000000000000000000000000..51febe17fefb6589028a5c99c2970a44964dad13
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_1_R.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e716745b5de753ba9594cca657e8e5610ed4f7485360c218fecf996dc6883be
+size 114232
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_2_L.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_2_L.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bc6f25d209554cdd70fb0b31591ff76dd952e49c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_2_L.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e77401e1335569a2b9fe55af188768829dba47f5dfc54a982925f39c4fed38ba
+size 78238
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_2_R.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_2_R.obj
new file mode 100644
index 0000000000000000000000000000000000000000..98f57bf4356465c0b8c334b2e8ba6636986e3459
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_2_R.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a0dac0a84b03aa3cdb0482a58d38ce98df1d904d8a919b26a35747273f3b997
+size 79705
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_3_L.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_3_L.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6bfbca7bb208c0994f3f3d0c27f8488e169dfe44
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_3_L.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:596cd27d8736cc273ead163e8c08149ef1c54bdaa3636744762e099f2be43a19
+size 214920
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_3_R.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_3_R.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b49116be495ca40aa5be1ad831f13676dc6b01dc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_85_gripper_joint_3_R.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:629c1639e3fa07d38e3536c1ec19227e3969aebe867ad55fa8e25fa5a027bc7b
+size 227020
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_base_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_base_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e490e7823efb0eb8b5cffcbc73dd1f8a32826
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_base_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8844d7d09e05423b6edb56b354eef561ad6cf4787d8b7f980232cd4346f46bf
+size 86384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_base_link_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_base_link_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..97f50b6eb33350f752b41c23b2738934deddf572
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_base_link_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d03d6e7395b0aa11ae7954b2f04a0a650448547d5d4f367238098ccd848b3eb5
+size 2523721
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.dae
new file mode 100644
index 0000000000000000000000000000000000000000..5c6319336ab72b67b82743f59ebc365155d03e15
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:807aff52c5a12ca5429cfb2eb19cc88dfaf0083bac6b69e24d49f6beb29aa2c8
+size 21799
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6679ab7c18f58da3537167796792d5c6b1410ebd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f967bf4a720ccdecc85579c82c16058bce41ae29505b6992a147b09915a78735
+size 21387
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.stl
new file mode 100644
index 0000000000000000000000000000000000000000..434b257a7f6ad3d511b4ab161a7b1013da2a5975
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2222dbc6ebe5579e718231c4c1766680cfe77297f4086e84c60711ca98571a7c
+size 18484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..daead518e98d02ed6a8ddc5334b976987e5f6df8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3172e8fe000fda75b9b29b48cd8cad4011d184a6df23eb2c3adb6c1f5a4eae93
+size 154078
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..d9af1e883faa8bc2ea0bfc02b4f647b9958186af
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fca72dbbdbdd9de02a96ac3cf693e4f985ce60593bf3fec750120280890bc044
+size 235
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6990e25783e645b539f8bbacf316ec2879b315e0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58c5003a139496eb48c7a5220cb6c5ee9b72d4b7e99af425028f8d79ca90aaf9
+size 180046
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..241eaadfa5209f4a4e5bb5d2dd198b81862ac470
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_finger_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:208f8225e5291f6476f7f4ca307d3786808c20a8acb0bb759bd274cc69ea4a37
+size 110484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.dae
new file mode 100644
index 0000000000000000000000000000000000000000..d15b19feb229310001158ca7a8c6f8231de78d83
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8e6bb0b3b5ef6ea1323aa698a16dc1ec1a878ed2c2bcd43547750a18e72d9b4
+size 18425
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a455963857784a604a6dcda14f8abe009ccf712a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c254381e04eee630a55c81acfc0466443425a8f752f5183737c25f048424ece
+size 16457
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.stl
new file mode 100644
index 0000000000000000000000000000000000000000..67ba4121dd3dcfe8a00587cb13903aa93f9d3d9b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:657af24c1dff19fe9ed7c027fe3aa67448b42ad67f2dd77a71fd8b6f04346ad4
+size 14884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..bb92c2765ddc036caa868ab892b1e5ca729dfc99
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19b12510eec739a1a4655ff633ce3f02ad3c2eda2cb8f451df900ddfa76c8f14
+size 117207
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9ddcdea5a0b1df886643b641dbbbecdb1cd9fb69
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b13a0335f4528e73f97578deecf80966f3d283104137a50542eebff9627348b
+size 136894
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..5dc7bc2e860a8f146dee2dc76bc2e72bbc74d06c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_inner_knuckle_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c5d83c5732e50b5224e1393084d5123ce997f2d3468f97639f48d2257c5e9c1
+size 84884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.dae
new file mode 100644
index 0000000000000000000000000000000000000000..bea4c0c3eaf28a820b0997c1a7661d554c07bfb3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e45e7773260cc4b04bbae626af94bfdd49c38b3c69fc248cf0708e5fa65a39f
+size 21252
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3e791ccfcc4fa645b2e7fda13b0f778155fb4b68
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b404f95e3e5e372206de173dd0e70286a017f065385b0a2df8da89de15cdc492
+size 20584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.stl
new file mode 100644
index 0000000000000000000000000000000000000000..2061c441dd1d4642783f356b8eae83c63aa18e7f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7eb5949cb98e382f500ef050537cafd5c8cebcbb753c88d3a6ef06ad42d8ab93
+size 17284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..77d2bf2b4ff3555378903a9005cac2c9b40006af
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32d842e51e0ddd25b3354af212131befa191c183cf4cdb61f82ecbe5c5276323
+size 120496
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c4bbc0bf4c1d7096472e26b0bba5732f12abc132
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d213fe23f7cb8fcef2b9c6556c9051edd92c2f03ccab49e81e77498aa591da7
+size 132641
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ad91ae4f66536e0497538cb1bd49ef12f103d1e4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_finger_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e77b6a9227acd546f433c5a29abf9d27848193ac410fd369c2b56c264d274d55
+size 89084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.dae
new file mode 100644
index 0000000000000000000000000000000000000000..0c4814d9ba893ff5ab4828b35dc42d422fc02a35
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c7ce8ee42044149f9a956d55f33f06e6b75e5e35080ee4bf1e8cb381535aeb7
+size 25969
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e5f1275605f663f5839b4d5d3b2c145b58d1871f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f951bf4ef40fbe75e0b2bc0bd713cb8a72d75ea554dc9d62b2ba486d2251c04c
+size 25953
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.stl
new file mode 100644
index 0000000000000000000000000000000000000000..923521ad79ab288bda9a62d9692333d686ac082a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba362290082912a5cc1718d9406886a23aa3c15c6de850eca0ce81c09312bcfa
+size 21084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..4c2a9f4c01e8178bdd3d1d924486f4322b3c71c7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1cc5198c9f3b06979af07d2944034b32acbb6130df5396dc2c7253cb656f587
+size 69464
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..30ee68439d12479e0711b358207d3c513a76e9e6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba43ef55f4819ef60c79294037dca7f6f7ff1a7eb4d3f62fda728d1f9355dff2
+size 108831
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..52f9a1f822fead442bbe832b536eb6ad5368ae61
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_outer_knuckle_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d0949ab29a3def1b35b48b15576ec9adf5315d59a6277efe2ba45a48c3a393d
+size 67084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_pad_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_pad_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..29a457b7e183f3170e5258220c21f7533b437167
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_pad_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f1d5b8403fed489fb40b389e3d9ec27882db324d77d20cc84cf8bb8901fee80
+size 4136
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_pad_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_pad_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..0b5446e63b6a984775f7c690499b0b0d74f958e4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_85_pad_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:063a977fb9800b206dc818234f4a2193ff19602366a388b4140442f53463389e
+size 684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..27875acad4f68c7c23267b5e2f401698d94437ff
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:021222c315c9f3318c058654b2407e142c93bdfa7ecf87b11fb67493cbc472ff
+size 110714
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..3ef56dcf17eb02fa9870e22f411d2543de2f49ff
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_arg2f_base_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:111e37f13a664989dd54226f80f521b32ea0b71c975282a16696b14be7cc9249
+size 86384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_gripper_coupling_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_gripper_coupling_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..e2289a1e75b5ee9e21d22de667fdb5805011d08b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper/robotiq_gripper_coupling_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4281e83002a25c20dc07c68b8d77da30a13e9a8401f157f6848ed8287d7cce44
+size 160684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/base.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/base.stl
new file mode 100644
index 0000000000000000000000000000000000000000..fc34f505a5f2d462ca7ef782ae866c4de663f06f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/base.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1019b87c1dcff4a08a2fdca2dfd4893d60a5e2fc53512c6b5fe2e372b75c9aa3
+size 2307084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/base_coupling.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/base_coupling.stl
new file mode 100644
index 0000000000000000000000000000000000000000..eaa901b1171bcfa51d7f2c36f62cb8a3407223c2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/base_coupling.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a2517f9d6d78f89d9edb617fb93a279b4d52ac61c12d9cb743c701676eeb06d
+size 540884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/c-a01-85-open.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/c-a01-85-open.stl
new file mode 100644
index 0000000000000000000000000000000000000000..fda17bd8508a457ddde2de1981d00d33fda21152
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/c-a01-85-open.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13ad73ef491f6f28b9ed6b8fbb3d6fb45896110ea7d415cda1689fc8daa5d925
+size 283384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/coupler.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/coupler.stl
new file mode 100644
index 0000000000000000000000000000000000000000..c29e887c49050da3e8a4e17092726c6db0e20688
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/coupler.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54949f7355c35c976d854fb77272feb92d9201213e343a8852429556fc81d416
+size 641884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/driver.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/driver.stl
new file mode 100644
index 0000000000000000000000000000000000000000..5da0f469460d9b9e7209896b0584e28d1fe0766f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/driver.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baf8b4dde18ce59eeebc0928a289c69dccec9da81bb186e2838e2e304274e106
+size 438284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/follower.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/follower.stl
new file mode 100644
index 0000000000000000000000000000000000000000..b1e46dd408ee4ed64ba512e6f9e612b903ea5461
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/follower.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28811d3651345dbb5f2020c67d3bd05f754b5e3e791e379c3c4d1d87418bb9c5
+size 572284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/pad.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/pad.stl
new file mode 100644
index 0000000000000000000000000000000000000000..be08ea411ae9d06d9ab566af28906c702f132176
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/pad.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0f4d31e867a5b3634c102669b76ac5e8c026ede5fc645b751a5eb3d4bb0be02
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300.stl
new file mode 100644
index 0000000000000000000000000000000000000000..200adcd0dbf6f42b61090bcf3bf6a39eca04ac97
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50be7b765c349abf102d81dc62d8816a39a6345d3e794dec505ec8d5a034b973
+size 5915384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_base.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_base.stl
new file mode 100644
index 0000000000000000000000000000000000000000..7715b80d70f9ffa0f36bf162004e60eef0eba0d6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_base.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1f5aa91c33e900fb10be4d8cb5658d669fe48fdc5dbf2bf61ca03002953b0cf
+size 1104584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_coupling.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_coupling.stl
new file mode 100644
index 0000000000000000000000000000000000000000..b31f72d8479199dd4fd97bff7abe0456bb539e18
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_coupling.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c45d18f0a7aa4a52582bb0eeae061f76902803965dbd7712990627b200eb667d
+size 2006284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_top.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_top.stl
new file mode 100644
index 0000000000000000000000000000000000000000..4249b38b2e6cd342db0978605036720fc45aeccf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/robotiq_fts300_top.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e9ca76b0b09c6871e4c4ffb5031a85c19db62e5734ba41d93622322629fc189
+size 4111084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/spring_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/spring_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..1cd4d44f9e96b739b1a3037f7d5ddef9dbb46216
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/spring_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56e9f28ce90841d654ab6d953161aeb62142b1459c210335d5862e7fcb281aab
+size 656084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/tongue.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/tongue.stl
new file mode 100644
index 0000000000000000000000000000000000000000..0e502de31417ddd1ad66b8d18e97be1bc44211d0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_85_gripper_v4/tongue.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bd5df8a2132542703d40d94be36a990bbc3eabe070283232871147081348d52
+size 383384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..7b8659dca8cfe90a377661a3a110ab4ca93832e0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51a5dd4d3afb59a319724f9b51ef27616b079d49dabca2f160ae0cc06c9dd7ca
+size 10884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..85eaa49951dd376e78dd0d8577726eb8bf6c955d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e9d3a996e9508ac064e2984f0488eaf9147cc0c547f897ce70aa9c0aa623d27
+size 39300
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..da7a6a0920505546563fe55d9ca16f62e88b67a1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_0_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f555448a7907a16daa7c0462d79aafae97a81c009eb7238460646da690b70fc7
+size 25084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..18d32cfece91797a4b8645658943c2c36e96a3fd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50b05475bd700f1d6418cbc52bbadb952bb1ab876b850f9c36d2cd309d7b78c5
+size 11084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b03b4f5fa89144ed3724cadb11785997915e1c43
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b21997b387c22d130e785a086b7e15686c0d9118add614eadaad37cdd9f76ec
+size 73867
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..10bcba9b46ac9079d5e4263e0cab843db3802ee0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_1_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32cc3a859be1c5fd8ad7583a41bf619e2ea01d2bfbde94a17c5dd3217576627b
+size 46284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..14ac4e25c65fb153b8f3c2f0257758f940dd6672
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9f1dcebe3eac5d4ab3801d6e2722bbd9e99d9c3e3b2f741f8bd4efd997b605a
+size 10684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ce2e2a4e135352f45aaf6b9af1394d67d60c296d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68de4ff372a69164fae416134aefe6b32638d2d685c75dea6240ba37c4c85472
+size 69320
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..29ccf85d1e0aed6bd74517e06a641cd70db373b1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_2_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a49ffc1093232d6c2c690c3d59566f512c66198c23231bf47825be143a456f87
+size 43684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3.stl
new file mode 100644
index 0000000000000000000000000000000000000000..d0e5b4383a53028043e53f882b2c2904f05deeaa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db8715e5676185ee507b60a466d3a31622192e7a250bc56286923d9fd1a435a4
+size 15184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d133dcbe7cf5a3f9679b6477e52d0621fa26bd68
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f1884f923a14621a8b45decc6524ba71e5c31022f9c9cb57352e8cc24682494
+size 59535
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..cf397c85d588b2582c8fbe3b91138c1fdd421bdb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/link_3_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea63028f37f3ff6acd587559e6c48c4510ee766c53df813c7c6cd5796beaf6fe
+size 35484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm.stl
new file mode 100644
index 0000000000000000000000000000000000000000..8486c3b16b20d908f81b45eb40802575e70474ae
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc17ceb8141d8a145af7abc86e89edad55a3a9b4e3a9c8bbbf7283adc8b759d6
+size 133084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..67462553270427a98f0dab21242693b96acebcc7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19b2e0b9dc20cacc505131f9d7b01bf0bcebdb12f3c3f453aa87b7c0fe8a3364
+size 1055346
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..eec508569dfa92159ed5241b4d96521f18730adf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/meshes/robotiq_s_gripper/palm_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11a87fed9f516a6d81473a8ca112e6b976d44383a3375ba6e8c84b7040455141
+size 534284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/null_gripper.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/null_gripper.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b6467381562ad0bf41548ea3d7d14aa370786fbf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/null_gripper.xml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..60a118587f2d80893cec7fe8f170717539e7a490
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8dfe6bf6554ea5057c47c1c7a506ea3089f6801300963231c150cb5c283864e
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate.obj
new file mode 100644
index 0000000000000000000000000000000000000000..629e685206ac79ce00cd97dfbabca077633b5ffd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4537b1b8c6d45fbc8c0efcdcce6cdaec14aa9c6a13b1c6eb52adcbf2be9a79a9
+size 2085066
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate/connector_plate.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate/connector_plate.obj
new file mode 100644
index 0000000000000000000000000000000000000000..827ede49ac7a3fe0b5c30ca5eb4532c2ef461503
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate/connector_plate.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22caf42a3e398e1a737efd1ba0c86788b194126c995c0e7449687709421b384e
+size 7162239
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate/connector_plate.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate/connector_plate.xml
new file mode 100644
index 0000000000000000000000000000000000000000..91b058d047d1f1962d17396c6592da9df457db28
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/connector_plate/connector_plate.xml
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..97c37a579113abad3a3c13ceaa62d68730bf8b47
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a43c875708217c6c96f7825ee317b89d7ce2913af83cab277118deae7f77a8f
+size 427
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d9b61704a1cc0341f7f4238860beeba921a0bbeb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd36692c2590a3f2d6d128a0dffbf7d4f6ac316d0bfd2d9c2f66a2e31e3edd1d
+size 862969
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base/electric_gripper_base_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base/electric_gripper_base_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bc5dd9de415768cd52699f92187da3bff3128864
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base/electric_gripper_base_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:234baff59359fca1ebd3dea25acb8ac80d28e6a576b322dd86d0c9e6cd8ca4f0
+size 358197
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base/electric_gripper_base_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base/electric_gripper_base_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a5acc7bc46fcf41a91dc706469f5285df8a9eba7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/obj_meshes/rethink_gripper/electric_gripper_base/electric_gripper_base_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:995fcb7934edc215173f403f20870799c9772fd1f314e749d11c46d68318e330
+size 2512399
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/panda_gripper.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/panda_gripper.xml
new file mode 100644
index 0000000000000000000000000000000000000000..fb9463484727e1df64cc3db2be2ddb4a8341a22a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/panda_gripper.xml
@@ -0,0 +1,54 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/rethink_gripper.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/rethink_gripper.xml
new file mode 100644
index 0000000000000000000000000000000000000000..072542a76e7701285956397bde7f9066a7950466
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/rethink_gripper.xml
@@ -0,0 +1,73 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_140.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_140.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e6b7d79bb835c4fe1ae1a5fe2a9c008d4e6c3d2c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_140.xml
@@ -0,0 +1,119 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85.xml
new file mode 100644
index 0000000000000000000000000000000000000000..22ec92d5546c1e7a747a3fe177ea1d5330c2ce65
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85.xml
@@ -0,0 +1,185 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85_real_kinova.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85_real_kinova.xml
new file mode 100644
index 0000000000000000000000000000000000000000..d4a24161ea0f7311a935fd4851b2e22d2fbef881
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85_real_kinova.xml
@@ -0,0 +1,186 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85_v4.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85_v4.xml
new file mode 100644
index 0000000000000000000000000000000000000000..13e09b7f53b2cc320283229e501edbd89968ae34
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_85_v4.xml
@@ -0,0 +1,165 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_s.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_s.xml
new file mode 100644
index 0000000000000000000000000000000000000000..26e5c749114241d9343053fdbaa41c6d09732d9e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/robotiq_gripper_s.xml
@@ -0,0 +1,182 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/wiping_gripper.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/wiping_gripper.xml
new file mode 100644
index 0000000000000000000000000000000000000000..829fff7ec3733e777e528ebc70f3885c4702f7f7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/grippers/wiping_gripper.xml
@@ -0,0 +1,73 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/light_maps/photo_studio_01_2k.hdr b/phantom/submodules/phantom-robosuite/robosuite/models/assets/light_maps/photo_studio_01_2k.hdr
new file mode 100644
index 0000000000000000000000000000000000000000..b298836f7ff552d84055d8813bf2255ba3fe3148
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/light_maps/photo_studio_01_2k.hdr
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6178cf04ea2ac9390b8794d3088a04c0254905f335f5f30d2b582c57c40f387
+size 6375901
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..04d42877603638d350f9fdb749596bcdcffd9d45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23b5279777ea3bcbaeb3a0c748f95a51d1dc3bcce1db8c035a9361512e73a66
+size 124
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4f1d07b322563a13b0a87fe0113ff01fa9435cdb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b57c84fc95e3497c4554bac47fbdac9b50622083e36010d99ed24604b749d81c
+size 16065
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ec574cfed5b11239176ef327c720f4a8527e6f99
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_collision.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73296b38d0d267f5d2aa0f8626432807e7e7fc3b6aa50263da9d31620d5cba1d
+size 10284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..31f11409b2145ac7455d93c8d3aa918c51aa7f21
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3658352da1bf92339c8d5da2a9c83c8eb952779ae351ae3a7c3d675923a1078
+size 139
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..f357a16537a5a203c83bbad4d373d8a46f1f6127
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fd6a4a0ad76ab9dabad8de389d043e1d1cf856b0d94d8e4bd5a4e476fe774c3
+size 1128212
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..e6b86c57258affa2767b8d6846bce74ab5babbef
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_minimal_mount/pedestal_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73266304fb9e039e2d77051465f5a7697d351326c09b4609b1d8dede6ef5fa6d
+size 501234
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.dae
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b182cfdd00212645903b3980e1597d0cbd76e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9a74be4ae114c9acafccf68e1e49d8fd815ec030012ce1b60bdcf9b30db49f5
+size 2734652
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..7486ceea55881c4a527e51837c38df066a77c9db
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad7c0b8f65e2212a0c70ba09e79ff4729cc288a963bc52d69c0bc0304a683aaa
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.obj
new file mode 100644
index 0000000000000000000000000000000000000000..abb25d449d73b66c81efdf7c3484d4e95a571762
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5db81bfd184f4a20857a238d878cdb73428986767ec7aec5bd629f75ee71c75
+size 4009925
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.stl
new file mode 100644
index 0000000000000000000000000000000000000000..78cd09c62f6cdee80a1698c3077c7dd6d5b543bf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/meshes/rethink_mount/pedestal.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f96ff12bfb347c77580e4392556b15e47ef7228711c45b0f1193b100af22866a
+size 1647684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/null_mount.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/null_mount.xml
new file mode 100644
index 0000000000000000000000000000000000000000..7ef2cc3631ccc42d2f1134d6c7900d9ed86aa0ce
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/null_mount.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/phantom_mount.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/phantom_mount.xml
new file mode 100644
index 0000000000000000000000000000000000000000..f4b25f61dcc72d08ed06a1670eaf2234f574f2ef
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/phantom_mount.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/rethink_minimal_mount.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/rethink_minimal_mount.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b3c31886b5f1a9c98e90ff30efc21f6b823e72ca
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/rethink_minimal_mount.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/rethink_mount.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/rethink_mount.xml
new file mode 100644
index 0000000000000000000000000000000000000000..7fed3a86436dd94cf60601c7af2fa0e3429b5181
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/mounts/rethink_mount.xml
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bottle.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bottle.xml
new file mode 100644
index 0000000000000000000000000000000000000000..5b83fb2f0d6f8495997cf8fea1b92e8c45173583
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bottle.xml
@@ -0,0 +1,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bread-visual.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bread-visual.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ff20060006c2250896d01e71c10a07db88053245
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bread-visual.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bread.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bread.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a5796c20d2649482a79d17642b00a82512c703bd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/bread.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/can-visual.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/can-visual.xml
new file mode 100644
index 0000000000000000000000000000000000000000..5c9d905ae89d55866b3c65be69b60481f165155e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/can-visual.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/can.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/can.xml
new file mode 100644
index 0000000000000000000000000000000000000000..d9c222b379b5dd01d6b55649b29039b79efb8c76
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/can.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/cereal-visual.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/cereal-visual.xml
new file mode 100644
index 0000000000000000000000000000000000000000..eb399031e23eaa73f578e855daa7a1c9fc42c87c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/cereal-visual.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/cereal.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/cereal.xml
new file mode 100644
index 0000000000000000000000000000000000000000..708de0f5f374afe0e6be7e955043145bc8f5f16c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/cereal.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/door.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/door.xml
new file mode 100644
index 0000000000000000000000000000000000000000..5ef1c5a685f747ff7e9205965b19f924bcac9336
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/door.xml
@@ -0,0 +1,40 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/door_lock.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/door_lock.xml
new file mode 100644
index 0000000000000000000000000000000000000000..5337073f3c14f48f15d19baaa345ad38267afb82
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/door_lock.xml
@@ -0,0 +1,43 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/lemon.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/lemon.xml
new file mode 100644
index 0000000000000000000000000000000000000000..6b2c6a2f71daede6a1a2384052f7c3f34319e8ba
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/lemon.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.msh b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.msh
new file mode 100644
index 0000000000000000000000000000000000000000..54a9d787168fbbda5d14ea4c8144ae5dc153e469
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.msh
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16203cc4564fe8dc2c1e5a6bdbd20ff87561f016f3b11a2fb6445e144cf80fa6
+size 14272
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..dbc0ceb4c124304b234285c1dba173ed9befb5fb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e427f17d7d05eb45adaf1e2bd7fd249bc3fe56437ba8e348cbdf147cd384b996
+size 238
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.obj
new file mode 100644
index 0000000000000000000000000000000000000000..fb0198196513ba778408d536fcd511756261338b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92353bf012eaea33c1e0d78d391fa61e083d53c57325e68d3410b31649c43da9
+size 8999
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.stl
new file mode 100644
index 0000000000000000000000000000000000000000..cd2419eb4fd7fcbb074c214c6c0dfdf3168e4851
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bottle.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2b6ca097b2d0d43b255fe3d605c8628117459b1d7369220ddf62ced5e84d962
+size 6684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.msh b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.msh
new file mode 100644
index 0000000000000000000000000000000000000000..93a48ecfb152f84765df8d80f6e9914b54b271fb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.msh
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:befe24f2e5876b3bf7df07c92622c52691ec818d73c9594cb68b6fc43a1cb8d3
+size 11248
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..b0b6cb8dc80f989a843c100aaf94cce73b4fbe3c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d776116483420a01ef53fd8f941583255a56467f521768027bfcd0123b7be527
+size 265
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.obj
new file mode 100644
index 0000000000000000000000000000000000000000..f8424f309bdf8f6bf7c6e929e0f013cdab2937a7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb11db62e9da167ebf91ff4bed4d07a2f8d8210d5333df1bf01e18217a37cc1a
+size 8026
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.stl
new file mode 100644
index 0000000000000000000000000000000000000000..1215c89ce1a8337fcda6d45febbe9e6a7c885237
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/bread.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c6646bcf9ec086d972d4d89141e8bb8081f14170306376bae59d3be59a6ace2
+size 5284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.msh b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.msh
new file mode 100644
index 0000000000000000000000000000000000000000..e17dffc422fd0ba18764f53677c8ce0297fc65a0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.msh
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a0e736201604130d47abe6f93e9173b11b1d49df6e2f7093537a435c061cbb8
+size 103264
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..aedbd504433cc686ed638d5d13edfaf5a32b39c4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:292e64f3385432e11d55dae641f29b6855e4fe4e2d1d48baf7e7ea218a5e9d85
+size 264
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2dc699c438431007ee8815cb6b3bcb32db225d69
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1904183501b49346131ed46bd1ee6fbf2c14f394c233ef9df048758c9a4b5118
+size 77216
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.stl
new file mode 100644
index 0000000000000000000000000000000000000000..b848dab76d94d2e184388ba8adae65470b3b0893
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/can.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb459d8f0dcfc855aadc91815462d69ba58b4f5bcc289631043f17eccbf937d3
+size 47884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.msh b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.msh
new file mode 100644
index 0000000000000000000000000000000000000000..671c657da225036af7c2931f4ec5a5570ad63b29
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.msh
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baee1abe4b11299a3cacab3db566273953f6444241c92a32ba3c8dcbf7fe8d5d
+size 12328
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..a5dc698ebc28f604917a9624364da4da959a1adb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:485789398b0ef6b1c147956be55419ea06396686e69a4fb244bdbcdededda341
+size 266
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e943e328881aa8e202549f442c84af21277cbdfb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8377fc93fca888314fe6a47bfe392fc8eb6db8f5c0cb796ffce8f9e47f589bff
+size 11709
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.stl
new file mode 100644
index 0000000000000000000000000000000000000000..99560a13d2a6df746a78c60eeb0a303c0258041c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cereal.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd415818056831cb6fa4b40112280f463de8624b06e23d188a7e0f83ae5be1df
+size 5784
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cube.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cube.obj
new file mode 100644
index 0000000000000000000000000000000000000000..10ba1dc72aab49609f02593bf2fb09d61e205b72
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cube.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56a9cb677cb4f267f49c879af58db1d24d0160688781c90498bdd6a879e87dd4
+size 788
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cylinder.msh b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cylinder.msh
new file mode 100644
index 0000000000000000000000000000000000000000..a32038a621d6aa0af121f3f3db35ecd6bffae47f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cylinder.msh
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41061b9ceee866bcc115470d0e3fca91f013f2b1b93b5aa03f8357a9025c37a3
+size 27232
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cylinder.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cylinder.obj
new file mode 100644
index 0000000000000000000000000000000000000000..110f6e974c245f291a8f6b32e396e69769ec71a0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/cylinder.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d8c3cbd3c7e45f6fbb324c2c890a0fce966196813e2f4567356c814c053a56e
+size 23189
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.msh b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.msh
new file mode 100644
index 0000000000000000000000000000000000000000..f5843d937342345321d686f599fb44863f9cf26c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.msh
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84d98eb8a8e406de6745280af4461b13b9e4e1475451aa27674de3ae27c5a486
+size 14272
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..dbc0ceb4c124304b234285c1dba173ed9befb5fb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e427f17d7d05eb45adaf1e2bd7fd249bc3fe56437ba8e348cbdf147cd384b996
+size 238
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.obj
new file mode 100644
index 0000000000000000000000000000000000000000..24bd3cb5821deceb45c618424ce2c507d0b119e0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a14962cb87de19cfa50bfc8d0fff4aab01d82f0bb7460b0894770cc635ff635d
+size 9695
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.stl
new file mode 100644
index 0000000000000000000000000000000000000000..39d0590d0fa05b180aea24a1ae6dc487ade165fe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/handles.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5564bb9bfc069a3834f87e6761c5e5797856fe8b932928aa19efebb1568625a
+size 17484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.msh b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.msh
new file mode 100644
index 0000000000000000000000000000000000000000..6dee42b15420cc45068b92ac3d696e2f5750e36a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.msh
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a2dfa73cb1e6650180bdb67fe253c565c409cdc6d89f70937017afe5555780b
+size 57040
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..4909d82692b41142528425095c2c1e5eaa8b71e2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4457f44c4aacbe345e57c7e73ed87656f06b3dfba225e32ed427d2b79c79d34
+size 265
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b139f4392c19f6b1827ee41a6afcc16bc0db037a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97cbc7113211f9525427cfd9087b891fb32eef55a5f691b1227c4f498e5bd99e
+size 51035
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.stl
new file mode 100644
index 0000000000000000000000000000000000000000..59f92681d6f5d877960ef3ad45963cd1c2500c8a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/lemon.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcaa216283267fff9621f524d609b59aad279971dbb52cb263680f03fc1f79db
+size 26484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.msh b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.msh
new file mode 100644
index 0000000000000000000000000000000000000000..ea137bf28fb3148e04d4514b59fdaebbdc799f80
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.msh
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92dbdf689a2fc2a46ddf3d5390cac3bde7fba9bfec642e6693acf205a201a40d
+size 26152
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..054a76bedc1ca9f0da9ff303e942b3f71d1602fe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3a9c63bea5ebf37805dcd187627773757d056bf82fed26e4c0298c99c3b21ef
+size 267
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.obj
new file mode 100644
index 0000000000000000000000000000000000000000..fb85f09df8e0fc74b0a096dab13ee4f4b6042c26
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15f99744f9c5a73e50750f1f3e44208d084bff5686f96281349ed73ab38666c8
+size 20373
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.stl
new file mode 100644
index 0000000000000000000000000000000000000000..05a17aadc8a973b2c49f5fe1c344ec76f21cdfb2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/milk.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7880b8da658d7612990cbafa83fb8d358005319a5cab7be6db6993b4b830b6b
+size 12184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/sphere8.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/sphere8.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3320445e5d26abfcd6cae2e81c426e08333722e4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/meshes/sphere8.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:114f2371259557ac546250c63690045bf9fb9dd89171c56124a2df9374d120fb
+size 46933
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/milk-visual.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/milk-visual.xml
new file mode 100644
index 0000000000000000000000000000000000000000..0b92a03fd62892b9341459c04f3cbf79b9a9bfdf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/milk-visual.xml
@@ -0,0 +1,15 @@
+<!-- XML content (15 lines) not preserved in this excerpt -->
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/milk.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/milk.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c6a2404f1378582c1fc33920bf1da1893c54aadc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/milk.xml
@@ -0,0 +1,17 @@
+<!-- XML content (17 lines) not preserved in this excerpt -->
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/plate-with-hole.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/plate-with-hole.xml
new file mode 100644
index 0000000000000000000000000000000000000000..29ff23acd51beaf9f3d2698a14b79fb1d8fc182d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/plate-with-hole.xml
@@ -0,0 +1,19 @@
+<!-- XML content (19 lines) not preserved in this excerpt -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/round-nut.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/round-nut.xml
new file mode 100644
index 0000000000000000000000000000000000000000..894547e5d75613f6d05643b911be1cc39d66c3c3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/round-nut.xml
@@ -0,0 +1,26 @@
+<!-- XML content (26 lines) not preserved in this excerpt -->
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/square-nut.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/square-nut.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b33db2e8055d479ee1b3c15ee1534e0fdebbdc0f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/objects/square-nut.xml
@@ -0,0 +1,22 @@
+<!-- XML content (22 lines) not preserved in this excerpt -->
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f2c28be24cbe8df8292234616147770959d01cc4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b76defe6cb672ae9738a457b76312452969d2b73b8495e34269008c051d715ac
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e5933822637b38d096fb3a90d72fbb893f982a66
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cd395c754a8d50c124f00ae9424f6214e697519b3c5c0290be993fa33d8996e
+size 442742
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..de85f896b6886b97172f1a323887787c4fde93da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fc0a1a1bee8949c2247ccf837883af5e31516bc54dede2d962243691c0c8c68
+size 260834
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..5db192ed8f17c6f826469e5df1b47e3ce0486ad0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb7fcba8800dde485fe84b064394506feeddfa589414aed49da8494c93a3b247
+size 423
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..359ae12d5006880c825906efad350bc3200ccc34
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f6aa86bf7b0a59941ee85f0de0c94a0779bb60c347d24b2f2da688e037af2f4
+size 305084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..949a5e84ba7069b847bd31e392bed2e907baf633
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/head/H1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96cbc1f7b8c5a7a89927d0a91cf7d3993df6ce43f31e96c6c7baeae0e34d072a
+size 174384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..1645322fdf43bba38c31773faf6c87816409bc51
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b82e27896cf4c64d445da77b52b6f1d643fc76fbddc1a54b67572e83ae852c20
+size 423
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9fa04217c4ceb2639791bda0c98344dceaf8de33
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bba3da6ef201294504170bf897fd8f2cb452d98fde01f306fb6bc45381f5a6b8
+size 1115988
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..9e4e21da8b4a51d3765062c352208fe22c579b8c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_elbow/E1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68b033ec7e612536b07f6d095e6e43f74fa368e42f5db91555ab38ec504e8fd0
+size 196634
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..c1de083bd694a8d0ec52d991ebdf634e983ae9c3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e29f69975eabbf015f0fa798c5cb2dd362391cb8e90e25b2801515f42e2c1994
+size 427
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4e6e1acebb5fa97ce8aad5d59e167542bb59ae54
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f1bfab9203dc80a530129d03b90a9be6edeb9704d20fbbad9ccb63ceeb018c9
+size 1281812
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..9004f5a45481637bc66a90bad5d1ed16e3c2a966
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_forearm/W1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f5997c917acb65ba1c628bc0a60af101ccd1b2dcbf30bfd23f27ef3683c5ad9
+size 180184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..eee673b13547faba785a78d1844c1b51dfc23d7c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da38617d1e9dbf7521070fce58c7c9539d8d202a9c7bda713e81414271e76e37
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..525b73c94eb2cf01c2054cde21fcbbfec9ead465
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06ca546f11a057e3f90ca808c88b13531f1b0c85d5871394ce876f97c0017cce
+size 585520
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..6f85f47d825faa5f2a3eeae2fb4f91d4255a0677
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/lower_shoulder/S1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:679ae0707bfc3666943ffa94103f8b4134d924d0c86ff3e2f165f7f328988c2a
+size 168234
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..da780dd9687471c53d98d546a56496387f0626bd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af6e4618bf5b2698573adb222d79ec3e3a2eb58c31f9998a5537029b4eec5fc7
+size 1167
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2a9dd77426259ad43fdc65c9ecd09e5a8513d798
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:657b6843909b4ecec9852949e46c3bcb50382e7354353ad4afea064f0bee6073
+size 7674162
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..c9d0b7bfda82e66971b63b266f54b5a1ff479019
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9bd43749dca13a758d7a8aa1fc3dcccb20a50c3a9bd03089348265ee9a020c7
+size 464684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..ecf5425e98a66698505ea5616f2d11ba6c81a903
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bafb2488a40cdcd320933854e63846d52069484d2cf9ccb863160ff3b28a149a
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e9ae0148cf18dac3fe82c8dac5a85c198f446c6d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03ae32a5f1087f4165d442574b2c71d56eaa9ee8cbe94c0c484db2938da5e229
+size 958444
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.stl
new file mode 100644
index 0000000000000000000000000000000000000000..369ffe19ab92bce8d47f31ffaddfff0fffd2d485
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/torso/base_link_collision.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2980b03a0e699f0d85dd4ce3074758ee7067de4b81409b5fa1db4e3078c8b58e
+size 458034
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..2fde2abe0c641872c06d0f031560801b37efd9a6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e38a7464ee3b85817f62dc556e77ab2f153149f16ff57759015335abd9514f1
+size 427
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b0b0ad0f891a44afcc739e0faabbcc849437dd00
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4033a198ba715cfa8cb1d25d1d419bd76438abde20371e60b55ca52afae815fb
+size 1249570
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..cf1c2ed223be70895fbff5408d136be544b6d9b2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_elbow/E0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2badd3a918969773800ed5e0f378676823c446d511383f957d1379fd75649b3c
+size 214984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..fed78c3c1e365d5f0ea8fee6140543a06aba6173
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25c3f09349ddee41ba636c9a31fe17672681e580ed7af9c5a65a3d0b1225a9ae
+size 807
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..968c9c04847e80e44ea1f8bf10a751f94439def4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d3130c527590a6c53c35a9110dac29bb979384e87aeda6433d785d62d3828c6
+size 2802095
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..299c3a47dbdcaf8c481858a41b6b6010887e6d0e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_forearm/W0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97d38ddc30cd28ed713977f69f991889cfd16574470265a8d575cbaa0c96d295
+size 344684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..2560aa9ccedfe243bd6a73f02177291b163514c0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd4a453fc16c84b40ccf82e79c23958d93c552fc3ffcf2a8fccbdbe80c4a6d51
+size 427
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3b2722136933baaf50c7ebeb18cf9a76b279d495
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ff27deb7fcc317a7552a7f58de317e946e809dd3fe272a8d7699531d54e9ddb
+size 3708139
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..7da1a767d4fa90e51194a49ad94749b21bca0bd2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/upper_shoulder/S0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a52d18783743e956c62dc5693381748b40a0b3c5f19f63ce59495bf13e74fc8
+size 402034
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..97ad78af8b0a865f0229a7b73226da093d869935
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a92de9e7504631dfa839a43b739d720cbb0df1208eaa30d0650ab53b3f58236d
+size 617
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..dde0fee77f5861ce13374492f76d6f3ce3ac4054
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c18c29a228f1db62fed9742d0e3e009757f1ce98c5ff63a05084c1242730f24c
+size 1168229
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..f76da64469251765e12c8b07cc7e20172fc341d0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/meshes/wrist/W2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbaeb449896a49d5bdee6d40bd6eaf26f43a3f4b156b2328ad35f4aba00b73b6
+size 148434
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/no_texture_robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/no_texture_robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..eac7d8e648b4a1e3d634bd4342d7175d6f642a4a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/no_texture_robot.xml
@@ -0,0 +1,194 @@
+<!-- XML content (194 lines) not preserved in this excerpt -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f2c28be24cbe8df8292234616147770959d01cc4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b76defe6cb672ae9738a457b76312452969d2b73b8495e34269008c051d715ac
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e5933822637b38d096fb3a90d72fbb893f982a66
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cd395c754a8d50c124f00ae9424f6214e697519b3c5c0290be993fa33d8996e
+size 442742
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0/H0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0/H0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a1ae6aa3f04b0de267b37a7462edaaaf4ad5b891
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H0/H0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75393f5e86ed251401de0ee80b310b9f664fd5fbeb7568624d7ae5c178c55cb4
+size 550064
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..5db192ed8f17c6f826469e5df1b47e3ce0486ad0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb7fcba8800dde485fe84b064394506feeddfa589414aed49da8494c93a3b247
+size 423
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..359ae12d5006880c825906efad350bc3200ccc34
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f6aa86bf7b0a59941ee85f0de0c94a0779bb60c347d24b2f2da688e037af2f4
+size 305084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1/H1_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1/H1_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6bdb9d49fdc6dacc3bcf0be01f0ce90ea6580974
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1/H1_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:698c713c81b743493c8c7d6148528b3e3c0745e605ef422a2f577cb2592328ff
+size 408153
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1/H1_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1/H1_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b6a79d2f56397af9d42be8e6a77b069c980e3c74
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/head/H1/H1_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05f9a776f041fc9ff7459f0014770922804e883d1fb6646b6bfc50a83b37fe1e
+size 483
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..1645322fdf43bba38c31773faf6c87816409bc51
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b82e27896cf4c64d445da77b52b6f1d643fc76fbddc1a54b67572e83ae852c20
+size 423
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9fa04217c4ceb2639791bda0c98344dceaf8de33
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bba3da6ef201294504170bf897fd8f2cb452d98fde01f306fb6bc45381f5a6b8
+size 1115988
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1/E1_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1/E1_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..eb7d94622361dabdd7b042ce3eb2be3f1f9b12a8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1/E1_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6286acbf989bf2dab5c979e4f6429cfa92edc5dd4f45e4d6c4f7e3e3c47973e4
+size 1082756
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1/E1_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1/E1_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4becca61cbd400972034d95d700646ad02f59352
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_elbow/E1/E1_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87b4090e97be61c3d97ea5201b35106f974312386bdd9fbb15ee5c2ea687b326
+size 364501
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..c1de083bd694a8d0ec52d991ebdf634e983ae9c3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e29f69975eabbf015f0fa798c5cb2dd362391cb8e90e25b2801515f42e2c1994
+size 427
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4e6e1acebb5fa97ce8aad5d59e167542bb59ae54
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f1bfab9203dc80a530129d03b90a9be6edeb9704d20fbbad9ccb63ceeb018c9
+size 1281812
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1/W1_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1/W1_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4ede251c7d2166ccddfc16201b3f2d41aae0f268
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1/W1_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a061afc4df56e4cc9ab48009f1edb6660f4ee0b4ade7242af7250d33a599c2f
+size 1289984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1/W1_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1/W1_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..53869fababf0d9c4f4e7a76b6a97ac8951fe11ec
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_forearm/W1/W1_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c79f09f34d9e0de50389fe3d9ccbed5fe7295cdbdae89d437446ae34ca8a9c7
+size 369676
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..eee673b13547faba785a78d1844c1b51dfc23d7c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da38617d1e9dbf7521070fce58c7c9539d8d202a9c7bda713e81414271e76e37
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..525b73c94eb2cf01c2054cde21fcbbfec9ead465
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06ca546f11a057e3f90ca808c88b13531f1b0c85d5871394ce876f97c0017cce
+size 585520
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1/S1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1/S1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..0fd1fb0b2e63fce46d9660116450df62756499e7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/lower_shoulder/S1/S1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ea783562471b62e2266f8723d26f5c5075ad1f1de16baaa51965a8ec51b20fb
+size 749512
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..da780dd9687471c53d98d546a56496387f0626bd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af6e4618bf5b2698573adb222d79ec3e3a2eb58c31f9998a5537029b4eec5fc7
+size 1167
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2a9dd77426259ad43fdc65c9ecd09e5a8513d798
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:657b6843909b4ecec9852949e46c3bcb50382e7354353ad4afea064f0bee6073
+size 7674162
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..0b480ed159015071cdfff98a372288b25b0e809e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4d0e8d08a997c215656b39eaeac49810bed85cf6a777c07e87003861454577a
+size 436791
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..7e99d8300c1979629f09d3b156b307c0856765b5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29e572575d0a51fd38394eed90b90e444ad45dab5d2211160adcc82861bb4419
+size 917951
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a638ed4decaeb326248fe14d1762bb59da97a06b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfab1eefa32e40cb3a15507745e20f2f822871c8006abfca4eaffad35b9cd784
+size 586928
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bcd04f31bf6e318b7910f610284e8bbf89cb42d2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8d10810985fe13a711b67e6823507a0e1b9def877830f1595674c6dac4916cb
+size 643868
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4bbcb96d2478383c9d69379c869996859e2c7f13
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba6bb2b561734838120c24670341973acc4280de4b3e51cf6174526716fe761e
+size 7378939
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8516199933334d41091909b9e7df9a7d98eaf217
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link/base_link_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b907f1684b602fb4b5fcccf105c65e6e9cfd19d70ee007d6d6cb97e5b5b99e7
+size 76285
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..ecf5425e98a66698505ea5616f2d11ba6c81a903
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bafb2488a40cdcd320933854e63846d52069484d2cf9ccb863160ff3b28a149a
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e9ae0148cf18dac3fe82c8dac5a85c198f446c6d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03ae32a5f1087f4165d442574b2c71d56eaa9ee8cbe94c0c484db2938da5e229
+size 958444
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision/base_link_collision.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision/base_link_collision.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1ad2f9f3f2eb950a9c0153625da4d1fb0834ac09
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/torso/base_link_collision/base_link_collision.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4823c7870c2c4feebbd7cfe2363097dca57093f4477fff2b941834eeefd815c
+size 2367315
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..2fde2abe0c641872c06d0f031560801b37efd9a6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e38a7464ee3b85817f62dc556e77ab2f153149f16ff57759015335abd9514f1
+size 427
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b0b0ad0f891a44afcc739e0faabbcc849437dd00
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4033a198ba715cfa8cb1d25d1d419bd76438abde20371e60b55ca52afae815fb
+size 1249570
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0/E0_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0/E0_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..608ad6da6bcc1a01545371dc7d00dc05bcb73a87
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0/E0_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00e7968c4995248e49e30d97c2cd216ac453fe37346fefb4bf71899481187014
+size 1227887
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0/E0_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0/E0_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4c27157e19d075f6340fb7c12c9e4e7274382a8b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_elbow/E0/E0_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa8d3fcf0b39d04fbfdebfcc712e857984747bf74ed4b6b16bf4b1852b85667
+size 389970
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..fed78c3c1e365d5f0ea8fee6140543a06aba6173
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25c3f09349ddee41ba636c9a31fe17672681e580ed7af9c5a65a3d0b1225a9ae
+size 807
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..968c9c04847e80e44ea1f8bf10a751f94439def4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d3130c527590a6c53c35a9110dac29bb979384e87aeda6433d785d62d3828c6
+size 2802095
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..629aad022fca2bca9532342a515007237f845fb0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caf539d497011da5f8e4d9c17acfd9f30a9a16c825b8db0553ed19c505a6a271
+size 202838
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1a610fd515d1f7f3cd691e36d78dcf2dd00594c2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0ede1c72daf2a6eb428c98b628d5ed59e57694f967a38c88d434fd65e751bf3
+size 2859270
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5e40a54c19c243dca68ccf19d28d89145f7b4776
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:331ba0c22f54fceed1720cd759885464c8e2b4e75d2861d03e5d1d6c9a5a742d
+size 189386
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3318373728790dfe7a495b2ff8dec4425e2d818b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_forearm/W0/W0_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fce43a04f3c5f571d649ebf74fd3edd2c3ce7b208794766413e848a83806efc
+size 366039
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..2560aa9ccedfe243bd6a73f02177291b163514c0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd4a453fc16c84b40ccf82e79c23958d93c552fc3ffcf2a8fccbdbe80c4a6d51
+size 427
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3b2722136933baaf50c7ebeb18cf9a76b279d495
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ff27deb7fcc317a7552a7f58de317e946e809dd3fe272a8d7699531d54e9ddb
+size 3708139
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0/S0_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0/S0_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4caf5ff4dbebcb9b637dbcb80c2fe0739b38636d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0/S0_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52a5a1599cf4d83e1f14678dd6ad4f3791c6bb0ef65c4b5ec7033fecc6b7932c
+size 4816582
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0/S0_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0/S0_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c28ac4ce19fdacc1c80e80718ee36e5ad0471233
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/upper_shoulder/S0/S0_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95c1432b2f06895dc8adbcbca34479a6660c4de15146d6dbad5a50bffbf1e66d
+size 14317
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..97ad78af8b0a865f0229a7b73226da093d869935
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a92de9e7504631dfa839a43b739d720cbb0df1208eaa30d0650ab53b3f58236d
+size 617
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..dde0fee77f5861ce13374492f76d6f3ce3ac4054
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c18c29a228f1db62fed9742d0e3e009757f1ce98c5ff63a05084c1242730f24c
+size 1168229
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c95816df15a9866e10e7a7d75c3273a4ca031e6d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4efebb0c202790f3953bd2a9d0198d90a1e8b06f7869ef3c11a39bee0bff38a
+size 343268
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9bb90df69d417d8398fe5cf480fa78fb649c7ff2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfb43f7ff99def357f99c52cecbb710bd60b30a4b57d2acf96ef41cb6ce5dc4e
+size 3349619
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..21d84d2987ae997ee59a6a9b493df554542b3a73
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/obj_meshes/wrist/W2/W2_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1db9581a8d4fb2745ff8e87266c146872e5315bf49bfd00a395a377d589f6d3d
+size 138204
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..bd60be1feb5622a7d39bb889746c68540215be4a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/baxter/robot.xml
@@ -0,0 +1,296 @@
+<!-- XML content (296 lines) not preserved in this excerpt -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ffe82631ec082acafd07d1751071059059006892
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4568669d3a1b144c8f05c850ab7eaca71dbd3ad55ff93ea5a05b861faeb5f7f
+size 191420
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..bf36feb6f3c5cbfb95930a976b38d3bd6cc28bcc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c34032c80481e163cba7f18a105bd6379fbb6a829d362d834500e3486e41b710
+size 464934
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..15751124b37a00e80c3660db6f4c670c88525860
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7e40268ac1fb6da2804ef95f1e7489d2ee4c60750d8c3e2d71c75257531e594
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b695f7ea29d5cac3b5594733c14f0a7e473cc39b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ea0ded1fd32f422fa4d49ba8339ca519377e9b5b52a7787431a2ccd91d411ea
+size 3879140
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..065fdd69d999bcff6fee4b2b1058d4528f16ea7b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_0_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffb1fe84e189a71d44e181b9c1b536223065520ee584530b5bed387662e87f65
+size 1782284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4f217d591b489ba0ae2a0a27d1a5736f59d5743f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1b0f52b84415bfd0ed3056e6e8fd73f3549b28116fe0cf9cd86b0bbbd9fcb26
+size 175775
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..e13ceb7e1ddd80199d26bbe40d1e974cf791329e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abc2f56d710d0c235093772f67bf66cd70ee1dbaabb215eeeadf9ebe1af8c91e
+size 193684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..b9544679b3e5cb9fe731477cd85bb465fac89f52
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff59907b9e16f843734c07638cd6a5b7f4e6da3033e37252b2ff45f3404581a7
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bca5c9715c4748f9fa8c406d2cb041a97b87799a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baba099476580b0202ab43c194b204669b775876c91261a5f847e9e131ab125a
+size 986388
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..4d98a0934ee86146832ba9253940cc537a0be81d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_1_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:539a59f6200bdba9bf87f98b49c935085173145331438d388194febbba743895
+size 465984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5a152d229eab94565585620294cf867adbed6c72
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf01b42d94b646acfc520ff01ff62e393215a161a554d7345077bc88c90ae744
+size 89639
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..4ddb0184b5d252c13b17ae9d8bcf7678a6b39150
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6e1626424e9172161c4830eca09d9bd23f7c7571eab8e84e77fd28913834f40
+size 422184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..142f765e9c8a9fbe115f2e4d492187897521ab8d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7331b73a2eabfd947f83a52923ffe6e02de22631b8b7406fc92d6ff1fcc656f
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..25991748c4227e12b6135c5bd707ac5f9d888905
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34ad0041f3ccc351e0a947c115fb1100510f0ab76e50fea2d0f8435f5b5ed68e
+size 2988723
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..b1dfd0d14650b341801dfbecf5a217227f348423
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_2_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0511257c13eb8eedbf1f5e02aa3c6132a249554c4c512295400a8374812051b
+size 1327984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..79859b7b22e9c0850bc3089831704fafafd667b9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e57f6c82d68a0c7ad336995ce37f5f0b6463d5f7b69ea9f3ab4bea4c4cfbdcc
+size 119675
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3.stl
new file mode 100644
index 0000000000000000000000000000000000000000..e593b3a8a6c6b72d8b85712be4c17eb7cbd6e7ce
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b8f8382ffccadcc05c73dcb0727b7217505ae8105adfe7a15e83173c2b862d4
+size 231084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..54b602506004ee2d356deaf462d11b32168285b4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52a6d5160c4e8ba8f9ebbda46265f9c84a35adccd126b870b11fa0256024566a
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..502da3ca98c1d18ed7aba431147375dc52416809
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a77101fde75b0212127b72663a02a2321d8c29dfd2adfc92a024f04746f8e624
+size 1131490
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..4cad456384caa00fcd3352004545f8265e3c5b80
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_3_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4823603ca250881735890a0224b3b7bf81107d54c662b36b51abc8656f384b41
+size 528284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..f9ae49c4b3d79adb563cdc270716716d94f1760e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2abe2350293ad05fe8ef7f5d3ffdf8b5d62f99e264257e6f982ea5d2a960de3a
+size 96582
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4.stl
new file mode 100644
index 0000000000000000000000000000000000000000..10280d27b5be12ef60bbfda93a42c65133a63945
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbf6666c08a1fea57463075f261f2ab3eebf45f764e1ad104ff076ec4f1f51c7
+size 423984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..3659b8b3dfa0f5bfc2e4f53fbd17c221af3fda68
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fb6f71fd66d56883459f6d489d3f14c05ef6093a538146d7e9c35c5b5cd8ac4
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3f35bd16b815bd0f855b61d6add7e9f25a767119
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8e1f9bbaa11b24d73a1435e1d7c4cc926c25fe879925117216a31ec1845e8cb
+size 2999816
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ef8aa38e0de4cedf965368bfd67ac9c69fce8593
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_4_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89dacf4668f45ee51cd1f0a7cf027f180272c170ff1a1a735695a58dc2ed864b
+size 1327884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4a0b13e8614c6b47b72094b5f5ea1aa83b8308fa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67dbd40d05edf4cc10ab38c5a9f84b83ed3902ea0f14b58a30ab01190ed2f1d6
+size 78828
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5.stl
new file mode 100644
index 0000000000000000000000000000000000000000..eaabe56277598cab20b746c6ef868bae5e69ebf6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:485d39c71a5528a4d504a8e81a94ef299c1fa5f51f6807abb0b4ea9dcfb68869
+size 220784
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..2404d80f7dc78a3e72bc71c016c91406a152359f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98daa94aed669f2db37529a42819c873c07a68d0849e50486e642f0a9cd64d08
+size 235
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1ab6de93aab71d4fe857b3921cadc904e3e324ce
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a15e1fc99c265c208b05b1fb5df0b0dd1e87c2d1fac5f36671d0208ea7cc50f6
+size 1254618
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..d16f44c14dd704bf147da9575c77d34f26d53af0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_5_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7727674cec42ad82e6f6a73cb5055e83ee445a213ed12cee1e624aacbc707b7
+size 580684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5ba6413ba362388b2a34077299178239308a2a62
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad373b171e80ce3bb0b262d7baf57929357720896eb290ceadd7b4515391317d
+size 70813
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6.stl
new file mode 100644
index 0000000000000000000000000000000000000000..a0c931234bbab1cd40628567572cff3c855126e7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df125a61e2862e6716896d88d2b929c2daa91b26233df9ecb98d4b2a2c90b3e1
+size 484584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..cfcb76337509653df1d61a61db9a9e483ccaf28d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f1fb3e3100f1fbc6519277625d567e67d1eb18a491f5ab88f1a60abf03424f5
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e22e849001650d17b6000cf408b9e05bc9665f3e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35b680097ddaf1d3db9920eb5fd00c6f76f01d2d632d9c5df32f1787359f1243
+size 3007254
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..01c21a59bcdab1baa06ca4bb83197b6f808030d2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_6_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72f756210c1bb50bcaad00f443f3e6a8d68f946b5fbc69751ee70362ab1e0966
+size 1313384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ec9ba43f88b638d49b293bb97657ea59dc911d4f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4daa6315750ae17470e3b7c24a11edecd3676fd771dee9b0e3d4c44860360d28
+size 96043
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7.stl
new file mode 100644
index 0000000000000000000000000000000000000000..82828f6ee1a58b200f1b4757f2e59b823f1fe7c0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09bcb05bc0f47074fd64c3d9a0546fe013ace0939b0dcb7b8588b6af06914637
+size 653684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..3ce79682ac3b74963638bb7db25964697653684d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:646f866567c527bb730e65e31fee915beeeaa36631227f9aba6038f474ba9ae5
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..379bef2de9ef4760effce35ff3aba4adb4940ff9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5e5ab7b6a099b0c63e4006bf1836919ee24fc703925d5699d04cd2b5c36cb10
+size 4662165
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..04c9486ff1b7be2b2edc80953b4b0c50794259e4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/link_7_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b6477fbbc24007398fa2972d6a3b15c3cfc6a6286dfef95732deef0bf8569e7
+size 2130884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.dae
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b182cfdd00212645903b3980e1597d0cbd76e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9a74be4ae114c9acafccf68e1e49d8fd815ec030012ce1b60bdcf9b30db49f5
+size 2734652
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..d14dd8ea3ce64fc27bbf003316b5c60fed2de6fe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:006ba359c617d76b9765b9beae494301c2816bae4726504b4aa5401a618df844
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5a74893fa216810483bafbba10b283ccdea9737e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/meshes/pedestal.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5665d1cbbf54e71bec781ee0af99684d2e516619817df250e43782fb377c723
+size 4143393
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..061dc5bb12f97b5f85d00086d49c8912a7d09b52
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/iiwa/robot.xml
@@ -0,0 +1,88 @@
+<!-- 88 lines of iiwa/robot.xml (robot model definition) were added here; the XML content was not captured in this diff excerpt. -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..6b7de5ee329a0bb5937cfdb270b21b940fdc2dee
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7eb9e711e672934d2808c47b88c14da1e0976514691aa667fff6b1e034505ffa
+size 469658
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..7735c3c0ea73461539b6124ff8db833c37bf2d82
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfc9456293025112e1ac54bfcaf7002336ba67f9621aa5210dd83388b3c6ed17
+size 225
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..7c62633de88da80dd71e601dd3685c0dd434c3d8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ec4a0297285adc071a7989793ce38230b48378c14358f7a60178c2bd23b8a18
+size 703206
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..627b39b9b59daa6a54ad95efe2502481a3cd8701
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3845b39cf479174a0405b4ae1830ca1fa445e5451ed9a26b7870f3d7773dc19
+size 321084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.dae
new file mode 100644
index 0000000000000000000000000000000000000000..0510b03c108882d5a537f7ead853e14acc698962
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ce56479380f41c816670cfbed4fad600329ad890e1d2b5d7beac891b0212f24
+size 467029
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..c0d7270263ba65dcfb5c13be911e075cfc4b4458
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d11b5b4f8e111a2a2d6d0351c89b77a39f34c2fe92afb1616ec279075b3ca42
+size 229
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1878f3be721c58c833422b2d7c8ad1ae499487e5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a01b581909e2e1a803d110bfab7c89910315389adb01402ba2a9a597e139d917
+size 704116
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..902eab9217526b673ce6d7305099cabccfd9aa98
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/arm_half_2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8ec7a8deb28027d4c785611449ba83da2b61245c408646f7dbe15b297563143
+size 321084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.dae
new file mode 100644
index 0000000000000000000000000000000000000000..c4aca9a60ad20d883a652b7ebb2211e9e65797d8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84d6c46a5cf934f8d9abd669928948857ef8eec0d537ea226e9e44cd72e5133c
+size 1336047
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..1ff704558ac720191e6fc588fb66b57030eb9515
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:681c302b4d46ae0a54aca2befb0f70c0fd31d98c562bf67205a9a26eb361cca2
+size 233
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.obj
new file mode 100644
index 0000000000000000000000000000000000000000..51b9118c5054ca01de5cb9fd971de1e6fe9fd190
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:221f1e9fd309b3a97e781da619f55ecf3a0a1f07452b99745729f2e29a4fb3d4
+size 1886377
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.stl
new file mode 100644
index 0000000000000000000000000000000000000000..656b3894aa4edd0d936ab364149f1edd7d2a31f2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/base.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5889fe8e3c490dc41b8532c92deb7895dbf62ecd1544bd43942681efe0336b60
+size 853484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.dae
new file mode 100644
index 0000000000000000000000000000000000000000..5af3dea8b5fd5d41de6a5fa41d50b87a998dbb01
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4da6f9148f2b3fd0d4fdfa13129f1a1f2080277a07598540636c4b9eccabc170
+size 547752
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..3a2d845a1579b03b1807032e68e512b461a05d62
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97e4d6a811c2cb53a32b4fbc04ce0f4c6170533c99995d42305336a08bebc8e3
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a5050a9e3f28e0e90e5c3f00e975bc72b3042e36
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25bf077ff919ca1881d82465e5b2cc77cad96f248dc638911158eeb1382cd6dc
+size 761635
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ca64dc7649e275a77cbc5d38e9875f0695316172
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/forearm.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48cdf606a4d84b329bdb37508cc2aac522312ad61c216c3dbc8106f7df022256
+size 350384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.dae
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b182cfdd00212645903b3980e1597d0cbd76e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9a74be4ae114c9acafccf68e1e49d8fd815ec030012ce1b60bdcf9b30db49f5
+size 2734652
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..cd396d468b9404d42e14b2272da5b7ddd73194b5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdbc8ef1d184ea57a0bbb1b6e688b3058da58ac7c1d6981a697e419cb473916f
+size 232
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.obj
new file mode 100644
index 0000000000000000000000000000000000000000..7375daeb23defc94c51297f77995e5b9dd661122
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/pedestal.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61cbf39abcbc7aa3f20642e370974ee05348d070521fadae11046e0ee39bde9b
+size 4140465
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.dae
new file mode 100644
index 0000000000000000000000000000000000000000..fb3f8fc7ef0614b81942e6cdf89cbd34e629eac6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddd3db4dd2e64ef0673b90a0b2aead433c65ee81fa485a64c169356dcea2b952
+size 34042
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..a7f5b2b2f66cded0149cca162555a5ea086c706a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a40128e8b368c2481b6bc2eaf57fbe687bc540af04dd37e6b64246e12045ee9
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ce0a217c8803b2d9054fbdc140aace38643c6e17
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c72a733aacb06838a754281355c290b6db22baf875b9c7022b9abf61e40c6f42
+size 37985
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.stl
new file mode 100644
index 0000000000000000000000000000000000000000..96b9f64fb6e77f248a87334c38894542a2802635
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_big.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0715633b6d217880a8291843f4716de29faa317c7365be7d3c2c6b95ad47c58
+size 22684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.dae
new file mode 100644
index 0000000000000000000000000000000000000000..84a5951ee6c71476b32936883981c593580974d0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf4d3b83ed06116b66c6d33a7e2d5e2d2643b7a822147c44bfbba80b5b27a234
+size 35177
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..b16b231843c0fcb88dcf8f323084acfcc8fdcff9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:209bf888fdf75b7b8d88b4be11800f8f6f3e559c2ed02b6bb44e26348516fab1
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c60b2d412cdfb0f15a2ea0fa9bea6a2bc2dd390e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b5e76280ea5e11b18c34137ceea1af0a32c5a21a511cc9d94c1536c3fa6d85d
+size 38601
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.stl
new file mode 100644
index 0000000000000000000000000000000000000000..6ae5c8c3335e4d54343b30fe8fc9959d20fb1419
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/ring_small.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d60973e9d9ff8c6b05d49a120bab2fc8df42e7270d72e369808e9c1678db5eb
+size 22684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.dae
new file mode 100644
index 0000000000000000000000000000000000000000..ea76ca19b9792473706a84ea927588c15b51afae
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c94f20e293244a5cf55b724cc19573fb5f4dd3e7fa3ba6fb17f1582bc9bf9963
+size 643584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..436a8a53d3b24617aee67469d19c104d6da847a5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f8dc3a3c8635ab855c42c05f8b3079f9ce64a5074962584645bd04d36a62edf
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9b91ba4816967e9f7b4584a95fbbe7286c266489
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41c5e331f1a1b09f39f488b8762cadd81e1289c8c4c990d1f899d90ecdaa75b5
+size 903959
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ff2984623ee23cbfb35a72fa861752a3592222bb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/shoulder.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:142067f7985c4fe9cdd94738a0fa8a7fa2d0ed713851d275c4491048b6157e99
+size 410184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..ed1fdc62723dd30490650e0e52ad572364d9a047
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03bd008c3511cef0a42071b223f1089845f838df17975450919d852597032b7f
+size 566902
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..d5f1853eb4efba408083e3fcac3302307ceb28ef
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34a3959c4f5e5dab0d157e8336ccbd2f062c388fd75f7ed84c48a8f844b62d25
+size 229
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..03256674b09cbcdb4af71b97896a256da967150e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfa6353b552b3c02832229dbf520fbb23e16159878e170eba19c3d71fe94625a
+size 861304
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..62748fa31e8cd2a2d2f206bc1fdc9d559833af87
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e703075888eeded30c99b3daa83a98513ef3dfb6d57456b89f2f2240781bd62d
+size 387384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.dae
new file mode 100644
index 0000000000000000000000000000000000000000..c6515325880d144289e288b07640c4f60b5c65ef
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7d7924da2ca1eb6ba82c1415aa3171c52c807baf900222bf64597cf91516eeb
+size 568819
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..4614133957b197a54f8b88dc0b9780bae3916b3b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ad264aa4161423d1c26cc9ddac03e2a1270783d94b69269dbeba91e80789908
+size 229
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..639662ed467aa9f03bf1d187601d2502ac31915e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20aab8b37fd65a6ecf05138dbce7941c0c7d07bb1c553dd830937d6bdce10cbe
+size 861905
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..eddd6fe13ea74d96541e18252780a34ae3d30007
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/meshes/wrist_spherical_2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7eafa8be6d2d91ca4592c1bb642c116d879b5c0ba952a0ec3dc9d111a86b5dbd
+size 387384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8c82945a323a7d79eaceb67955d2d42ef3d99a53
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/jaco/robot.xml
@@ -0,0 +1,94 @@
+<!-- 94 lines of jaco/robot.xml (robot model definition) were added here; the XML content was not captured in this diff excerpt. -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..833c8d76591d348864282da750f553e3e1f6feb7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98838761774aba57f7d313c1d4c0d5625e6a89a178adc9799449713bbe7b0c6d
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e1f1647d12f3b54b48ce834df5d35954abcc1e49
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c825a124049f93948281b649c7835efe29759ec5bfaedf215f4ecec48adec2bf
+size 354327
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..5f6aff0d1c58b052b4464d41029bfdcdaa91dae0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/base_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e38517e653d58f4160cfdcee34c9dea983d8b3ecb8f2c147444f58235335260
+size 183884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..3ed77d1a7fe59181d2f777149a0192cfc3deaca4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4696fb326cdec70ff50b429dcefbfc26ba26783adf45cd0d09575ec8e8530e9a
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ca7bae252e93e44209b5711654e8335b7b7f77c2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38cc80420d197714119a515cfc95173fedab5c16186a444b564ac3efe94994c7
+size 2903680
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..705d64f18e58ff342559845393b277df90ae3c00
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_no_vision_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b2ec7dafbf6a3c9a2cac717d18132a8a1efd9fcb8f2c40d9c12c8ece7323ec8
+size 1334384
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..0950fd51318b92e80add78a82e8964da275f4da7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8eb1d69d678cddeac34e151a4f67f8e2a2f9db77668d31567d65beefd0df6c3
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2954c910a89904ba02e0f2aea04297b918aeff1c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df309dcea390ecf8e354cfb50b8b2f90efb481979375f243397fc88027e4e101
+size 2561538
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..72f81c50b040566e324f098cf263c7f9b84def47
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/bracelet_with_vision_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bb05e420a88532b4eee3948a0047fb8e7f341c3e5cc971d54e97e2aaffcf878
+size 1164084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..cae2df9738c15c518c28b53c66fe721b3a76b62a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bef05ffe22bf1918c479eecec9be20ed7f98455e6ebcebdcc14e5fc88ce8546
+size 47
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3ee3febe0bee3d37a24f033d69fc9ea2149a0ca6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:252e3687fe858a3a574ceb798f9e49727ed497820faf208e72e834faf7ef900f
+size 78
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..3e2754219dbba13475bf5179dc55c8aeb1ec4bf4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/end_effector_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b6fb58e61fa475939767d68a446f97f1bff02c0e5935a3ea8bb51e6515783d8
+size 80
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..9cec2eee6960ebfb966ac193a6287b10e763511f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:beb7e4b5cd33e7c65396133c8a2d2d17a42a028f07a3e904565906e7ef056451
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9c3eecc1639d613e04a1ba535715788d4bd8f754
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a34e75d356d679b1c033d8b18192960faf1e4fd804126b17b30c36e29e7994aa
+size 948560
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..0c616b5f7115ce35cf45991c428bf149a9a7593a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/forearm_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a00d96d2c7ec37e9fe4c5fb6d3fa2b4386dfd48fea5ac25632f4cb83b3505105
+size 456284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..1db586566e18400bfaeceeb18b056f016403b9ab
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19d1ee125ca4ba44781bed352ab6aa2ba9687ac4d57d37d13973e574d42f2463
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..183e0d74cf881f37edc3767260883270d7a321d7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c186dd0f5afee25e06230cb00c30f70fcfbf869f43a21be3e1c4bb0e47c4a21
+size 1051661
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..00b636d56211e93fced7042e4373bd32436e4353
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_1_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8df461a71780d8e2d09104a507584424279d85173adf14867e359f7e58b3bba7
+size 514184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f9c8fbae18a8189d1eb9623e790272906bafbeba
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc5dfc5f663bfbd0dc05352d612f06a8a2c6d0e4cd65d0b460bfce3b2ef83346
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..29de7606f5cf69cc72d2caaba96c02b74c692a51
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d67b050144df7a50a8ff5cdc3c26a557e2abfcce6845daf1e64334dcd316f95
+size 1005957
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..375f68608fd5cedb74e2a4ad0d1a0c3fb6ba1dbf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/half_arm_2_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a9f38de21648804b3ab7076ec73726031f2ade7c95eb2541d070553e025407b
+size 489184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.dae
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b182cfdd00212645903b3980e1597d0cbd76e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9a74be4ae114c9acafccf68e1e49d8fd815ec030012ce1b60bdcf9b30db49f5
+size 2734652
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..43b29f033881c222955aaf42bc6820c23bdf92a6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:530777aefa4b4db928631ca9c2db96e5876b0685fb821b1bc8130dfd786dd0d6
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e4974e2e9d26fb45580a525207c1c9fec2082e5d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/pedestal.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7b0bbe6fb0d171dc28df60700f82896cc83a01644a766ceeec0e4ad74e6f544
+size 4140469
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..5e1ed20951d24ab46a84983cd64a8324b7411c68
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f8d77769f51177f3982efffa9949adf932524d128733cc2c41605ca28ba171b
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9cf1dee15a4d86166155580f1cf4c8ba19c6d14f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72c3edfcc1ad141dbf299509e5d8245de760d213de8f2440281da7e232a502cf
+size 1017935
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..71dbb2133e60e6fdc5aa136217d5ab9b163b44d5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/shoulder_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a98e232e5197f09e9cd520e7a450c21bcec74bc49e2a19ce7cf86c5dc25f2e18
+size 476984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f74f823fa29fd1ae18c44cc314b56f1328e0b925
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cd709ac78fc6db5917af4fc6a4ab7c6041b755bc07d89bb5e752294a09c78fb
+size 232
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..21ba0ca9a01455e9326216e2088466a6d622f2bc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2ba21dae81886c0c9915534da2887d58079cdf8b6b6b669ef080ffe52bce7c6
+size 1155068
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..cc0413db97a49f0a4554ce9ba194a5251a65f4be
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_1_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b97a5e3b2de535344f8e8d4176d77f0cdb3455b235f35430fa0413208856e66
+size 546484
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..9cec2eee6960ebfb966ac193a6287b10e763511f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:beb7e4b5cd33e7c65396133c8a2d2d17a42a028f07a3e904565906e7ef056451
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6aa155dedb8ca4ddd2567b045fe4d085ccd5778b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7302bdaa889e4e485d3f1b6b2b46431dcc86daec256c31dbe58eb75770451fec
+size 1084389
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.stl
new file mode 100644
index 0000000000000000000000000000000000000000..aa941bfe51911b1d342252be944c3b57e88e2274
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/meshes/spherical_wrist_2_link.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12271fb7d0ee78fb5642424cf5078be20f0476061e74203db964b674031ce879
+size 516984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..9b1a239369a61d45de01701c643822f3a447410f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/kinova3/robot.xml
@@ -0,0 +1,80 @@
+<!-- Kinova Gen3 MuJoCo robot definition (robot.xml, 80 lines); XML content not preserved in this dump -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/finger.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/finger.stl
new file mode 100644
index 0000000000000000000000000000000000000000..ef5e672efbb990561b36fcee2c15b2f61cf42065
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/finger.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d07a740392f3b9b0816f65d64fff9927d3d57c897870fc4b6ff9c56fff3a0c8
+size 1684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/finger_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/finger_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..6c81071ad1b6f27bd97ad72839d9833f23ef439c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/finger_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f5556f3ed25c5a9292e342d98c09885ddbe39c7096fd6cadf59b0cd93079fd7
+size 31284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/hand.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/hand.stl
new file mode 100644
index 0000000000000000000000000000000000000000..bb315217a60e27343b84a9d4e3a4686762c4fc8d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/hand.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94493e94f30fe940f2c8ca2f155c3bbe67bbff406d3edf5e261670d2f0f6e2ed
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/hand_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/hand_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..aeee94b7236eb7b6702fd1c65beeb463824ee9d5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/hand_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87a148e5a35a67f3a3e04104d1d63b7056f91b416e8aa0f37d5ecfe61e923fee
+size 353984
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2ea2fbee592e033a1cdf431400ad8c2ac4091248
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9ceaa66bb3a734e3a32f2f737ae57a29e922f4a962ed77b9bb8d8e25cd33159
+size 1590896
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..bbe58384ff30b933eb8758429c4f5cbd970c1b50
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfc6d94330de8ddb005b311bfdba9f3b8e1aa7c256b71592ee7ff32cb9a9a5aa
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2ea2fbee592e033a1cdf431400ad8c2ac4091248
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9ceaa66bb3a734e3a32f2f737ae57a29e922f4a962ed77b9bb8d8e25cd33159
+size 1590896
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..d89dc9e808c4b05d05a13de7cce032beeae8ccd7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12b22d2d2823d4c2f36095a505c3bd365200d4ae83d2b7d9021715439c175b68
+size 2316
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..306edb5b025453a5dd6a6df94a71e9d3632cccc3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:accee28b0c08b14177854785a17ec6c8b7d282b2319b80f6fb2e5a135c032c69
+size 2293426
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..b090d6f954c9127ee749f67f8781c899c595a13e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link0_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7047d9cc40f21d4e23ecba81d731521434e0cb30b278c318d5f12aba48105081
+size 1024234
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2030d60bcfb6e3f00c6287fbe8cad91be27118f0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c8b7b7c1217d620a811fc0ee52d1d1b0e1470de955e7453872aac3f15cf7c5e
+size 978415
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..16bc4cdd84f3dc82098e6e15f6e3c7dbcab73786
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d24e332dffccf260b91d05dde17c5998bb9559d37da8608a8ee5213d9661f603
+size 625884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2030d60bcfb6e3f00c6287fbe8cad91be27118f0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c8b7b7c1217d620a811fc0ee52d1d1b0e1470de955e7453872aac3f15cf7c5e
+size 978415
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..9e84da922e332d3cf4a86768085b08c715a17b37
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b3792c21291581dc04e309ce2494b5bcf8fffc8b13bb86dbbcc7e19cd3ad9e3
+size 238
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a63f971a288c3e75e7b9b27bf2e7bddcd871cc94
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ecffb03db29cbe2f7ffadd5947409c7409f4f948e02f919fd318d925021b028e
+size 1374032
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..16bc4cdd84f3dc82098e6e15f6e3c7dbcab73786
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link1_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d24e332dffccf260b91d05dde17c5998bb9559d37da8608a8ee5213d9661f603
+size 625884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2.dae
new file mode 100644
index 0000000000000000000000000000000000000000..64981bd82b803b79f46005dd56885b4ae01e9d87
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c44d0364f0030007e427106a4e842d835ca43902716cc46ee4f3342dab189e12
+size 998486
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..6ba548f4137d4ba09e7b0d9299fa631b27af1ea1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:370f7605a0fae3529db169ded50f52f171024aa792d4d773bc84197301f6a039
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..64981bd82b803b79f46005dd56885b4ae01e9d87
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c44d0364f0030007e427106a4e842d835ca43902716cc46ee4f3342dab189e12
+size 998486
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f788da796a3ca791baa1bc1e32972c094d43f9cc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8970ae2bf72033dbcd5f9bec4f104fc5447188fa2b929623f4a281aef3aa8b38
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2001ac745c81b3788c996a3350b11d7490c04ba3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ada4575212260e77b96c9e5f3b42da0d8e5e6353eee57f8dcb7c6ee4ff559ab
+size 1386917
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..b15afeb957390112c44330a980bbbd414016259c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link2_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ca22e40bf92b26d88ae63a180867b2b6d226dac9204d8b749f1bd8337fdc852
+size 635884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3.dae
new file mode 100644
index 0000000000000000000000000000000000000000..23d6124df5e3d5696241441fbdf7dcfe53dbe150
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dab39a126153fb82f3650cca6de63a8e978f851aa5020a8e91b3d9d548dbba3d
+size 1099651
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3.stl
new file mode 100644
index 0000000000000000000000000000000000000000..7115ba0e92d33fd3a2e6e2087df980ae8b9a6730
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a8d638b9349c6c0eefc4e888636ac4838c4b27170f18a51699321118af709c1
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..23d6124df5e3d5696241441fbdf7dcfe53dbe150
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dab39a126153fb82f3650cca6de63a8e978f851aa5020a8e91b3d9d548dbba3d
+size 1099651
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..d8bdf6226805a6f7f9c61a5eb0ba744fdf768efc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f019f3a20fddb775d6a6ea969d28f1a8c270bf0adcf448b9f7b6f4f9328881b
+size 856
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..93b8f9429192295ea55c8893653ae9e7cb8ccc29
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11f1504eba4f472585a8a3200d9aee420efc4196b31cb4284793a823ec007cd6
+size 1563997
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..cd555180b5df809da45f8a03dcd5b18aa248ab2a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link3_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9765165f24d0551b885bae0ce702448a711da22399c033c5e40bc56ad845b5f
+size 711734
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4.dae
new file mode 100644
index 0000000000000000000000000000000000000000..0ce1680db10d42992cb781fa23e3b5db43dea3ff
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e03d680e3a4a4555d673bcb8cb466e479f5cb069a5fc8a0b0f99c089c50fd63
+size 1145491
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4.stl
new file mode 100644
index 0000000000000000000000000000000000000000..88c6db70bf3c3b68bce08b9bcb5142050b1f9079
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0180ebb5772ec9840cb049750cffb29a9ddc90311752a16ea34757782ef9e48d
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..0ce1680db10d42992cb781fa23e3b5db43dea3ff
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e03d680e3a4a4555d673bcb8cb466e479f5cb069a5fc8a0b0f99c089c50fd63
+size 1145491
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..178626f13382ab50c1a1d557a9cf5cb006987aac
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f08734c55cc3fcbca529277e5b9ea8f41e544fef6bafa253a3f181b6a7d3589
+size 852
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..db79df246fa4d6b36b9ea5f80d49dc8120ea46fe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32989e80f160b70838d65a51dd3fd12fab36a4e0317e469ff547f03ecc81d0bf
+size 1615731
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..9a1c7ab2f1d9be60429570d322413590b2e539e0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link4_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75f852b23a9cd24735c59feacb12b4cb9ac018c2a1505a84b655144faefbdb30
+size 731134
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b6911ff709357cc25b27355fc36873d1d30f9cc1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0be76681192578a14d6ace89527e4ee418f7395e825e285125e05fd998d24e3e
+size 1438169
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5.stl
new file mode 100644
index 0000000000000000000000000000000000000000..5eaf5c8ec2155135ab9297d51e3dae6e5e280675
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd17e688c7870e722283525879643d53a74c0024d328b0e14b034b54c8b6c31a
+size 15084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b6911ff709357cc25b27355fc36873d1d30f9cc1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0be76681192578a14d6ace89527e4ee418f7395e825e285125e05fd998d24e3e
+size 1438169
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..692dfb4a552dd3b5d3ca5a2c1db99645075c1733
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58562759a863c7affa15368642a3bfa144e33ff2159ddd3a35f326b01bf8c105
+size 631
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a34bb50c9ebe5326196abbdccbc8b2cd5d22fe13
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9dc118edcfa779488d03c91f17e273360c96faf97ff82d145717d1da2d26f436
+size 2028820
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..0cc2f41f713f5a966d8af5030ceb3eec8d593f3c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link5_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:179de04c09da7ec80232f17265ed6b49d28aa4491a13aab679da107a74fff50e
+size 916434
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6.dae
new file mode 100644
index 0000000000000000000000000000000000000000..adac012b16351aecef432a28bd593edc0872a9ae
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed9c57432b079d55b9954775f2ddfe34e8b904f683949b8eb6314238f8afa46e
+size 1727767
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6.stl
new file mode 100644
index 0000000000000000000000000000000000000000..828ad3bd384b22ef734d8add0e50d6ae449dce9c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20b768e99a0e0440b5754dcca108016434e57937cc356acd9c352ccd3cb27f77
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..adac012b16351aecef432a28bd593edc0872a9ae
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed9c57432b079d55b9954775f2ddfe34e8b904f683949b8eb6314238f8afa46e
+size 1727767
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..5db8e4f693373969e375f7f5a54acb012c177a37
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d2f8656461177a741053ca584339195496d86c6e779a87ae8e68031fe0d968a
+size 3404
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9dd2a18e24b174200b79ee5e5c8111ee10818ab8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:907ec674c41933e28b21075e65ded297143d2691867eac15fd43721de670f18f
+size 2448144
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..83924e46ef92dcdcd777e0bafdb2f70cad66b224
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link6_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a25b11b650d4a1c96b2299a020e38e3caa592f6e3f0f483bee64823495b1688
+size 1081084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b6d289bc5b7d51793fd2bf805695356eed8ae3ac
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71be614f734bd27b2d7dec3e8bb022251cbbfce38b0a12dbfc1b88bc0513822a
+size 935952
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7.stl
new file mode 100644
index 0000000000000000000000000000000000000000..2047756ec662f051af90fe61266998bf16e655fe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92ac6afcf7574c034d3170d8a68e95ac9048ab9d0dd5bbd8311b86e551b9ab1c
+size 10084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b6d289bc5b7d51793fd2bf805695356eed8ae3ac
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71be614f734bd27b2d7dec3e8bb022251cbbfce38b0a12dbfc1b88bc0513822a
+size 935952
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..0d634f2afed484fa68d6ba05c06d087af0b6cc4d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce742e1ce34ba609c457614439bd79c33fae7cd87fa9e4efd32aca646ce0d3e1
+size 1644
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..7f2ec2019fa98d7e880537172e4467d0a620532c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b80b32eec4c6272fb58114b65052f60c89e8d0e56307697f0b055cd9e3826d4c
+size 1272083
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..185bb136d06185b914b8f16765f216b86402b3e6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/meshes/link7_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d7ef70426828402935ff2bfac9f8be2f770a0c37dcb43e1e3d5e8c0c0afc5ac
+size 604184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/no_texture_robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/no_texture_robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..9b21fad5b33204620d74c4b82e1bc95dfe5301e8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/no_texture_robot.xml
@@ -0,0 +1,92 @@
+<!-- Panda MuJoCo robot definition without textures (no_texture_robot.xml, 92 lines); XML content not preserved in this dump -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/finger.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/finger.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..685184642fd6431fa28c373ad174b027da2f0825
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/finger.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:525d16f9a1338ebc2f3c062b30e462a279356aa4fe242617d39e05fa6926c27b
+size 481
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/finger.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/finger.obj
new file mode 100644
index 0000000000000000000000000000000000000000..755b123571ff7acc83822ee38fe559ffd9de7bff
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/finger.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6959401c281e6d5b4486f08072140b6fdb3461678face301ac448cec06322d3
+size 65359
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/hand.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/hand.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..5ddf352addf4066bbae00eb184c7bd52c3d372d2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/hand.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da2fb08fb2efec231cfd38b710e657b3ef04d3d3cf18082d3d043080dd6c79df
+size 1132
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/hand.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/hand.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9c8e9c103f64c4fff4c6286309a537a98d88dbe3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/hand.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c95c6f99024842a8b9e73629dacc30ac5ee459feb0dbcc16df765fd27620584
+size 713204
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..12793d769dda4f4368d84ecf5383c31206fa0aed
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff032aa0aa3514c0820f8c07a676892fe5a7a85977ed3e4818d8ee8080cd8a2c
+size 358725
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..fa7b471c7e99d2e635c0c01c9b89084c1d30df27
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d033464e59f40a3096d2d6cf7853afb5bdb258bf19accefa24fa040ea73763e2
+size 122052
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_10.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_10.obj
new file mode 100644
index 0000000000000000000000000000000000000000..7c9f65b1293683e1909740b979c2d9d03b6c3a1e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_10.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d879fddef79581242c8d28bc4dea19759712f5d83820b9a7c8bf519035d52d3
+size 407944
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_11.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_11.obj
new file mode 100644
index 0000000000000000000000000000000000000000..f169e1f08a4c8989959ef2291bef904934a4a734
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_11.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f47131b0f4a638b64065fa9d7fa0f6d196c7aa56ba717ce9d127d36a67338a5
+size 24799
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..95c352c141b99a86897c14078f9eb0158f072d63
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f8782b8a222da3981d0c0108930fb8dadf44639a16bd85bc89cd7401ee5a766
+size 731627
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c464bf0f548080012620c8246dddec7a8898abf1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fc91c0cd10c4aef9a493699fa83457c225d0854b28d55ccb0b47fe38f412f28
+size 58037
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a141747c20557310d6dda40c009e27a7d9a08b1d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f23819a788f7ce26965e540cef7a9bf5970ed99da3c1b875a91b0e544511e39d
+size 256094
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..057b9c1e6d235eb8a129e8e967d2d4e5fcf066ed
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8e5ff98258aaca852979fd62a39d929c1d726d9e6fa19f1acc030553cdf2ed3
+size 21416
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3f8d4e6fbb74f023c80dc4ee5673544b996ac45e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41be78cbdc6b0d401efe4fa630fe6924ee3312e294403aa35a4e18a70f99139e
+size 4495
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_7.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_7.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8c15cd15e078f66486991ad420f87571ffa91a28
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_7.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75e3a2661f6366bb6c665282cd77f18b0da09ec1eaa8df29617fccf279970f6f
+size 34000
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_8.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_8.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b7d6621be9facc240620be13576685a5a43945aa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_8.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d751e2ad68ca735b0c3b0f8f11f7364026fe64166ebf4aab69446d4465c6129d
+size 3621712
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_9.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_9.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2cfb2ffd4095aecf65c47c09ccf38045504e9be7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link0_vis/link0_vis_9.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19817601101caeea79826ffe057ad0481b97f60e80870ee768fe32b6855ab9d2
+size 125294
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..23112caa06b319e789a26399eedb048a78b260f8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c6e1b7fe3a382b872c986cfb3601e79b701afb7b4bfc72dcd6abd4b31f9d0f5
+size 262
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..7e840697010a583ad63477e55fe911f23762ddfb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0ec1bec9432a46f797e912cd8165798e62ebfd85c0807dce142bd6b9a768cd7
+size 1070650
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1_vis/link1_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1_vis/link1_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..207c80dd76bf5676fc10da91ac22c9471e241b5a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link1_vis/link1_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e6cffc2656cac8c8faa9b43e41cba312b35519c9cc4c5bddd7159f14df50f2d
+size 3676668
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..2f3dc43dd42d4fe485f1863772b4292ad67a14be
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7821edf4876534ddeb9cdc789453e6473e2f1b9d33747f036005788b83199da
+size 261
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..72fc313fa59501f85146cbeeff5d94ff56d8356b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1a41ada6d6cb8c51a8d0d19db9cadea101493842a52db75529e7ba0d4592904
+size 1084458
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2_vis/link2_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2_vis/link2_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5de65d643d34c77892f6c452d5bd9e755ef28c62
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link2_vis/link2_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42ada2805819bd9364e567e23b4c1b71b91d9ac02525598d823c14bf8d25dabd
+size 3713545
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..001972f5535b391822f2ba5ed4070cd04414e3cc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2648eae04bf57f48100234ada9400156b1867d0fc2dbda2606bb786d6cfec51
+size 931
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d0b342f4ef35a302f76953b3c2874dbaa37d90b5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82f81c15c236a41918dade0241ce40e59c4a0ee7fb08985f3a934380a8f7f594
+size 1237175
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3d6091013a27ebaf994505530d2269f756de0544
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a1f18e21663f2acea23752dcb79b2a8e8ae96ab286dd9d90de1d61c0dd87c0b
+size 3499294
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b7a35d54a5bff2036a6eb54e74c032895ab2c16a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73ad9e69c96283cc865424a64e38b53b7cd4385a0ff9834cae91a258ab92cb97
+size 81340
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..aaed2765c1c528454b22eedce1dc8a773b70f81c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cd5a95e3599005bfc76e3700f1e3b3051efe07665b0ae6efbce75860a89e7f9
+size 98021
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..14cc7bfac589ce77334e02d80cf6e8c3ef045ce1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link3_vis/link3_vis_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01f69ca50a96cceb1bc1a0fcb4de9a7fe2b5b11d1d2e2a5c445723acef7eb483
+size 571618
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..9f1a2e75d4e6635d47ec16a79bbc096e8bbc0355
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa5e228aca4026e34e69dc9eb80bed96cc7d81dae5595984c16b464de5ba5963
+size 925
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4539ff1edbaff4bf1fcf058aae80566b960307f6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91f36ecba40ecce6c169ede6d152a6c8853ee886e2b145ef6c5865cc9d5516b8
+size 1272305
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..0f0ca00ede27baebc4b78d15224ebd1e7ec284d7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76f42fd26edf4a6b78b055d3b48c4df59cebb93ffc36fc6e6e6c8d441dbec3c9
+size 99644
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..16b3a4591995cb42ad2ab2dc43608a590ec0256d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85907c1c058ea1133cd6cf5a23535a1b77257b4852c753e08cbf87f79c4003c2
+size 3608854
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e54ceeb606068d3f57ddcdfed1986eb16b8c87e4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a019f7f862b782af509f649050ef45354ff73642742dea7525d8e1369a07fd94
+size 566926
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d4ab20950c045a29aa22a7174aaa1bc35ea2fdcf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link4_vis/link4_vis_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0fcc4417a0680860ee11af60df227ecc328b4b1c66908b9872dedfa8620d992
+size 80397
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..a68c251016268d4e78d167ca54448069a328e0c7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d44e731edacef0b66d7813468107877d05a12893f363ce3a0985c8305f404f3
+size 676
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ca493ec395907a490a279498629cff35f119ef63
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10d084e6f728af7d291403f0359373c121e87785b177e5c6df8a7e1e88077f11
+size 1582192
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e19d2b16fd86bbf114e1901aa5d8f9593b19ea88
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e30aa36ca950b2b57ba695423ee30da5da662b1a70b72d1533c5cd435b88355d
+size 907061
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..90a2e018b97d9bcd316cebd701c7308ea0fff767
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d4089d9f2d26ef677ff42b27a2cb4ba9691a8fcd19ac665ac5203b23d22b876
+size 45788
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d37b92cd0634bc31da3bc66b1992cd12b08621a2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link5_vis/link5_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c095257424ff95526d2ebab4f396833a2d77227e0d2b3bb899a0019d19c38c49
+size 4535777
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f8f25910225c6c5c60ffc027f30f68c19e975104
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78299752f3c28e87c19f5456e851423de647fbb065f8d88332c1f72ba5729b4c
+size 3552
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..37b6df71209edc724f2623bb8bc22a01288e69fb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d6c29421e8066761c993a519474eb58f851885387cc2ad5a606f96eb428f059
+size 2236857
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3641b11521f80eb46eb76c1e00448d9378b36658
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca3ca6f238a0ed1138bfd5d11982f610d96dfc98d2d663646f07fc4c2a3ea7b0
+size 181786
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..32e8999904fc52125ee50cfa72c591c3aec7cf38
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a96dc760e3b8b0f6ab4e098248e4019e67ec9fcf404c6ab944e216dfc752e590
+size 23299
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_10.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_10.obj
new file mode 100644
index 0000000000000000000000000000000000000000..87713887237e632206c961faffea0a829aa7b043
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_10.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:915c2eedef0d04869814126a96da1ff70d1fc89fc65ac80021084b00cad5d757
+size 462054
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_11.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_11.obj
new file mode 100644
index 0000000000000000000000000000000000000000..92397d1dcd2b8cdb7ea27573bfae4c54ff1a7f32
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_11.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6edb35baa656d0b647541ab9de4706c4d471ec831e732955f9f4d347f4bc02fb
+size 30999
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_12.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_12.obj
new file mode 100644
index 0000000000000000000000000000000000000000..69d264496143fa2800a40ca0c4204d961dcc07b3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_12.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5af3b6bdecc4de424eb0217627bf0f6185f126dbb0bc1a7fe39253f0a26f2907
+size 4214
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_13.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_13.obj
new file mode 100644
index 0000000000000000000000000000000000000000..00deb8bc958807e822e4583e01b894d4dd850685
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_13.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21dfaf6cb5f27a8593be82cee6ca3023e3829d5f2be5e30af54b5955ea87a554
+size 4327
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_14.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_14.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a4bc5fbd96fe9091274dec5676bfa4ecf5af6121
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_14.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8aa89ae8b85fb54400771754d25d6813dfb41299821bc984f37498ac0d9cba14
+size 533553
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_15.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_15.obj
new file mode 100644
index 0000000000000000000000000000000000000000..82d05761e240e015e1cb32b38f0ba123377391a5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_15.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9c3bcfb59c80b0e9f67c62775ea7564f1f11f55faf94b971c7fa2ee436e91bc
+size 819438
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_16.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_16.obj
new file mode 100644
index 0000000000000000000000000000000000000000..24e704998f534c62e3d848c1c08f8221424bc8c8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_16.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e76221e7c530dde22117afc57d95d48ddf30a5e586e3bd99ba6777a30b50e58
+size 4284801
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..90b0095a7ccdf63d4af8bd4b64936008a7b64605
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bef85a944a58e7032a765cbb10445ac44450fe047f3fefe501e3a3ac90dd6caf
+size 10246
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..28c3e63fbb4bd495dbd7744e15b5b379783370c0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8f1f77b3978a16fcc8b4dd00af50431d1bca26576ef92f73d90d984dd188211
+size 15976
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6942f140fca63d79181948ba71d913bd8174ab45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cf60228d09854089185c5bf9f23004603b3aef874b61b2b8dad7bdcc29178e1
+size 18377
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6fcb136c07d912e5e2095dbe7f85e5a9308b9c70
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfdcbfb610c052e2a4830fbc26a53ec99c0658ec3664a179c5471ff42d6a60de
+size 15690
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..71a321952d17098dca29d010868964159c9f5256
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b13bafd7d8cee84a07e5a0667da8d1e1d9467112415332f23ef79c582421f8ce
+size 16884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_7.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_7.obj
new file mode 100644
index 0000000000000000000000000000000000000000..99919ddd372f505a4965610a0fb7066c3a1e252a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_7.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52d8f9cac4326716f46fad9aa1314491d377b1b36a374258a97aae196b006e51
+size 2464
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_8.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_8.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6aa356f859758bb5dc518c2a8305edf45ad7273d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_8.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e586f0aaa641cdae0d372bd2fb689600cf13fc629a52ba4ad6076bfa7b0834f
+size 6323
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_9.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_9.obj
new file mode 100644
index 0000000000000000000000000000000000000000..7826b2dc9275567373bb710538e37ac15ff2f0cf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link6_vis/link6_vis_9.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a3caf4c3596ff2661d559ee291a9ac7f66a6d943ebfcc417f229de72962d0dc
+size 16367
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..44489a11f5f6f21d19be60998117bbef3457a5ca
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a43571b6feb5dbb0b6f5b3d0a06f51f356371d27a10fc580c985107db16f2ac
+size 1831721
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..fcb08857f1b2765c1f5e97f62f98b0676e786f99
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fefd18ce20e90dac5ce7d181e676e6cf7504cca1f050c4d05c098169b417116
+size 162840
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..88bd05f1d05340a5b5f113019c68b33c54e0010f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44ae15f4a83bd36f4de3cce78e3f592c8a070ca1fd4539be3b1c58e21d8d8671
+size 280928
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..eaf9300e7d600d382b71d4e5e3e01d9b5286eedd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08c6308530928ae7234edd1297039aff02b0d84d8bbfec8d01c5ab104fc4209b
+size 164977
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..50e878ca36e0e8ade0dd1c4c866ca6f270c66c7b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53e0bffb56a0fe9ebab0f43dc997512287ab14d8f9b2d77f1baf7f0c96367f59
+size 114146
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8f60a8f9422247b4f987f1eaf3ac7dbbfb360d86
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fd161fb00e729732950770f20fb6e75c5a5ca1620bceac9d550c54f322d469d
+size 304295
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4c404437c9157bfe08cefdd9c7c41401a6cabb95
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd4f2b39adfd4857f922f3104fdd4cef01390265a5f2582c73d97f39dbc1755b
+size 133885
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_7.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_7.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c26dcc772f9ba9e4bf7c2f9b3313dcd36967a20e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/obj_meshes/link7_vis/link7_vis_7.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4f9e3c8532df4d1588e1c2ff82fae10facfb30a52329b7f595ee37d7bad6229
+size 1058189
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..dd30bb8d028642eef62018c05c70feb1b5db2350
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/panda/robot.xml
@@ -0,0 +1,252 @@
+[252 lines of Panda robot model XML (MuJoCo MJCF); the markup was stripped during extraction and is not recoverable here]
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.dae
new file mode 100644
index 0000000000000000000000000000000000000000..79993c4d499be9416a86e6f5abdcb35845fb7ef4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f879eb086ce265c73fa16b6c39b9344be2545e293fabbc4cd37ae405f991101
+size 1000898
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..b1bcc9bf5d30a204abd9be1ee0ae6126bd8b0a73
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e96917a254c2ff4ce42dd5ee0c82bb33d632013f9aeb0493b7205fec5d68c2d5
+size 411
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.obj
new file mode 100644
index 0000000000000000000000000000000000000000..256cb2dab250dd39cad70c40308832a1a3c24097
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aaeb52c143f716686b9ff094c41a2af2e761f8c6b0e633d3bb9500202f1a72c7
+size 1423343
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.stl
new file mode 100644
index 0000000000000000000000000000000000000000..83382c3684a5416298a7889732f86b57a5635efd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/base.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97b3997a278b7d7be42142f49f435a4a0d7856736b943bfb3590dc43210055f6
+size 264934
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.dae
new file mode 100644
index 0000000000000000000000000000000000000000..93bf692852e762e132bf998061910aee05f2cede
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d367f8c3b05f0a9261f1dca2e9040f4ae87908f343448c7ab8ddd51f74884ec
+size 1389340
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..3886714b14a4116ac77b23f3b91ff18403fa2ea1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49c9ecac411073f585ff5fea5ca6f29e90bbdfe4254e63b7bfdb7afb176d5593
+size 1828
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.obj
new file mode 100644
index 0000000000000000000000000000000000000000..95de473d78781dadf9bd9acf14d8bce2691ed872
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e98e325ac05841d7b3e2fca1cab5d7520b4ff8de51532cf3f615de4f4d03bd6
+size 1966591
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.stl
new file mode 100644
index 0000000000000000000000000000000000000000..3fbad3d5b002cbdf5046ec679ea9a9f32aa780db
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/head.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2d5dd7e7988417897c1b3c2b366ad9dc53e0c096e0a4db810f5b6f08e1385a9
+size 276234
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.dae
new file mode 100644
index 0000000000000000000000000000000000000000..8f39b917258adab2e893e8ea9fef2abb8e0f1bb6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d5ef27f6f53e5ab78ee0791d28f47099753ab5966132f67339e3514346eefd8
+size 3760739
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..599be372537b75781e5f7decfa7906937744c03b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:264852472aeb2f441259a9e09dee42214c5b2b2b92fef9e6d6fd8b7d2dd709e5
+size 1321
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e3a08865ffdb3a3d0e8e43206a205d6e77e79e25
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e0871515535b0b4b60f31ca114470fc767522ec3e0dc4f987e7db9d724d1dd9
+size 5578526
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.stl
new file mode 100644
index 0000000000000000000000000000000000000000..263bfce5dbb757036183d9c849615e4508891ba9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l0.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f4a7f56bcf4cbfbb72414acd61956ccb8db88ca5ec4074ff626a44cf41a18c1
+size 675584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..253e63fd21bb6796d2357984078aca396175869b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f500967d2a91c6321a453dc3a5949013729354d04c8de4a8c20116e61918408e
+size 473148
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..100f24e77c21881f7d8226f8d147fd4e013190ce
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d10328be170934db4f575c103d6d65dbf954445ca649690d503797927306767d
+size 593
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8d1e02249686849b4c781be8dbdf465710b1c266
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e551d2b6633fa879fe4d4a69d584922205a44ad1ad9a3899736fcc8ae5d8edc
+size 696392
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..964ea6cdf273aa59cb968bec8b71889df8bc5727
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6d8ba089c3da8a4a40e176a13928e3a39cfffa2fdc83311ac2f6b59035ab6d0
+size 511884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.dae
new file mode 100644
index 0000000000000000000000000000000000000000..b83494d7fb80655f2e0d5292b1eea479e1b74ca2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a56dc6354f42d850b35d246266de67c0fb22eb17576e19712340f5909116aed
+size 654737
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..e5c3fec6e71833822eea82eeb9ac9b6a6a2d0233
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfdece18474cb2479a0ae487126743f4a2cae72f1f22e3de8d1a8129089ad71a
+size 957
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..97946cdb06cac7b6bf8e86dc9b06939d7a69a93f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:974a66e72ae987c1cd7f47fefd4756646e298739a7c879244b2c3ff0533fb19c
+size 953303
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..3ef2de15c32d518d145e0d76a78894fd17d563dd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a0d2ccf5668f737409d0e7fc2578b9e24660298d09787852494a7adae8c58b1
+size 133734
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2a6630f83b9739810d4bc5d6506ceefaa28a5587
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84f67d46fcc4acafd8fd7bd620ef6354b884d3f96eb4d09e792b40ce9ead0669
+size 618017
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..144315ce398fb6dc49e14b7e1e6a88972196d5b7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c17f048659d1e2e4929c4e958b60a54353e7f8c70f51441d30c19c919c4f9cec
+size 775
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2e5a528e05e888bca43d8e9760df7ecee4751c1e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52c5c2a65a874a400085a32033397963f68794cbb906780c9bbb6fe52e3f5937
+size 896221
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.stl
new file mode 100644
index 0000000000000000000000000000000000000000..f0d4d108e88ba1629d943451db1d68916e810bf6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l3.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9faa1f689135fcda53c50b18f9a5c55bc718bb30790395a20821e96b87a174d0
+size 160034
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.dae
new file mode 100644
index 0000000000000000000000000000000000000000..08e9bafdf91240f1c1213757797b5d33b63e0a91
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b1742337c74de3ec0841eeb651291a09dadd6cbe84e2b1f26a1c4a99272a1c0
+size 4922491
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..70f5f601a4c2aba96d2b1342836ef1662ed106d9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05f1cc50983886f4963f46ff05852a5553eba76f44d7e80bdc8ffcd9bfc634b1
+size 1503
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..0ea231d268aa577cefdc6b98566ed05715cd7f56
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ac9d58aa680a044b59980b81034acc0115c245e8ba4f5902455994e478c6583
+size 7258890
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.stl
new file mode 100644
index 0000000000000000000000000000000000000000..a7d307ae67fac5efc496faebd8050ebbf73dacaa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l4.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9f7032958ae8feef741a4a073f59f3cf9f8f491d93505d0bc9047269cb2e1f2
+size 208284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.dae
new file mode 100644
index 0000000000000000000000000000000000000000..6002ab4f2b52ba188f44ab1f786b71643d60d51d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8468804ab641adc86c10d3d21f22d6e025189bbc8f3a18ba8a7e73f88f49b0e
+size 2120107
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..bd633c15237cd4c54bdb5dcb2ac6bda0225d331c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35afdc0890972e202b4b329de72ff70cc9b878ed5e3379600edf4f71f27faae4
+size 957
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e7d0ca6e3d3c38efa2c925cad241fea20a5a4383
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e0d0f74383ee20bb4dbbab14dc21233a68458d842c7d972c7f68dc27ce55629
+size 2904791
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.stl
new file mode 100644
index 0000000000000000000000000000000000000000..1ca0d1a4164f61bde4838fef7f86312150c5bc45
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l5.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5078849cf3e1ae7790a6014e1ecb51eda84b44d2d91206783bb0b1fd1740e9ff
+size 176534
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.dae
new file mode 100644
index 0000000000000000000000000000000000000000..0a170130340091219653ef8fc94307b7798669ac
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:096155b75cd14bdb9e921c990f358844c9212178f32034a3fa92ef299039b7dc
+size 1990228
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..7c1910e1520fadf6e38711fe4d16876b9d1f0055
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2450351ee19839756c275f770bcc2edf7b476b42eb74b334bf679b85962d7fa2
+size 1139
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ebf7bb4ae33f5338b0248d0674e1859dc942fa12
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e5207527499e30c9fedec5cd3983792c59b2fc932304c1a38b352fe7ee2c615
+size 2838795
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.stl
new file mode 100644
index 0000000000000000000000000000000000000000..84627fc23ba0e3bbcb09e24872cd481bfd4a244d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/meshes/l6.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e55c25b435b2e038f21542cb9315745656fe248cfc1d388364acd7e73e4a28e9
+size 183034
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/no_texture_robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/no_texture_robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..cdf2d5b72f0004f781a4e56da4cecad7a0b2ce3a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/no_texture_robot.xml
@@ -0,0 +1,121 @@
+[121 lines of Sawyer no-texture robot model XML (MuJoCo MJCF); the markup was stripped during extraction and is not recoverable here]
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..b1bcc9bf5d30a204abd9be1ee0ae6126bd8b0a73
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e96917a254c2ff4ce42dd5ee0c82bb33d632013f9aeb0493b7205fec5d68c2d5
+size 411
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base.obj
new file mode 100644
index 0000000000000000000000000000000000000000..256cb2dab250dd39cad70c40308832a1a3c24097
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aaeb52c143f716686b9ff094c41a2af2e761f8c6b0e633d3bb9500202f1a72c7
+size 1423343
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base/base_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base/base_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9342e2de393daca72cf1ede392e7e7c93ceccb56
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base/base_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e547e145646f227341c5ece469598c3c028f6c344cd8a117b17c3989f7a2195
+size 24137
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base/base_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base/base_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3c7bc03d3ed2beb9b360e79249077feb008d5a51
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/base/base_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a29a1fc7532d40fe1ce6c50f78853f756f6f27648bde00b765918ffe047b78a1
+size 1902755
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..3886714b14a4116ac77b23f3b91ff18403fa2ea1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49c9ecac411073f585ff5fea5ca6f29e90bbdfe4254e63b7bfdb7afb176d5593
+size 1828
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head.obj
new file mode 100644
index 0000000000000000000000000000000000000000..95de473d78781dadf9bd9acf14d8bce2691ed872
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e98e325ac05841d7b3e2fca1cab5d7520b4ff8de51532cf3f615de4f4d03bd6
+size 1966591
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..19b51b856a58b417ba687e85de4a2a1b688a7dd8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f88d7b31887c4e60b0d63fc573ddbb7dc4b11f3553bf5850f0c86e995542cec
+size 17256
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c40664a2ae8ed0e374ca2c3352c702dc1aa6bd2c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:938fbcad656d9d92acf430754e4b86a3cf9da07cea1e8147fc3bcd96c2a8321b
+size 7308
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9a59ea2bba0a0e25d8af539193878a90c734268b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e73560e2ce46ac81db50f40772c11d7efa529914b370cc3f897fb1985abefae6
+size 26789
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..da6563bea56e6618bfa9f9a423aaa24d3cc81b5b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfa20d84f02301c05e5401182601789df5f352c1a96ce74f56430bb724a11282
+size 49795
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3d3097328f2b79caba6b42447f7ebae33f081fd3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c87abdbba03512c99de28290363b253e333f565135caac7c92365f0aec6365b4
+size 67186
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6b7239ec0c0b43a769b311434940f86be62e29dd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36053842c61f0977e5cc89c93621b5abf69807a0a081b453311ef1247483e063
+size 523234
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..797f4e954f3325048b7038395ac68de26b9e7180
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d690b8cf4d4cd0e58cf1f6374cd00e473a136ba10c6713ed132899be748f273b
+size 690999
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_7.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_7.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d584b8bc44a41ba7664eeeb6f765701a886d00a6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_7.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3dc7d9fcbe86ebb31b84e18c73a42bb0049ae3c31fd660938aadfc4118eb7d29
+size 67662
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_8.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_8.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5ceca6d4496df69034aa9fd8fd5618a3152bc4a0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_8.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e087cddb34d9b9708c6733d013d46b7e103ce1ce552c689cf3ee8170d13d317
+size 1249668
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_9.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_9.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5b966e2bb7c5e8fbfe5493d65175ad7ca37c5005
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/head/head_9.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e55dbbcef11f2a551d23144d41a09fb61471d131f5605f2d9772f72d550f9a5e
+size 2966
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..599be372537b75781e5f7decfa7906937744c03b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:264852472aeb2f441259a9e09dee42214c5b2b2b92fef9e6d6fd8b7d2dd709e5
+size 1321
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e3a08865ffdb3a3d0e8e43206a205d6e77e79e25
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e0871515535b0b4b60f31ca114470fc767522ec3e0dc4f987e7db9d724d1dd9
+size 5578526
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..98086c432394d3c04c0a42342ea1c8a197069085
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5ccce2f6632ad8684991034e90d8ea06741e577802c522d41fb6f6f09735ab5
+size 983
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b1e30caeb9fc3747d337bcdebd02051c95bc1326
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c2f48a54ac9445b0540a5d811c3a33f6eaa678866e50dbc1dbe971344c7dd6c
+size 10332
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..617d81a94092ede96a00ede0978190b8301c197d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b423c02e61c90648623f1884fbb78e21562ff6975e682f29725850d44b23839b
+size 1510147
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ff81ffde5cffafb658a42c69ed1c012101c77278
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b605855763d12255ea647caabfa40a0c13985fc8ae16507572737072b5fb3ff7
+size 19377
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..75d398fa59b508c4631f44a15074f992f96226ad
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:096c49f356dc63fe1b1b2a7a5157d9d32f33fe8735bf6844fed9ed706abcbc64
+size 2230
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6ad39a219b2b6db974fabfa8c25f80c5aeab59e0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa16edffa27981e6c8b9b39178b6e8ab694bf349aa9f4a2366d296db5b002aca
+size 2630321
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..a5c58d942a68ff57db57f4bfed24bb08f02f498e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l0/l0_6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1af4d3407a79dcaebee0fec864cc140154f6b19d784a79a6d60b514277c2851d
+size 3085040
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..100f24e77c21881f7d8226f8d147fd4e013190ce
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d10328be170934db4f575c103d6d65dbf954445ca649690d503797927306767d
+size 593
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8d1e02249686849b4c781be8dbdf465710b1c266
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e551d2b6633fa879fe4d4a69d584922205a44ad1ad9a3899736fcc8ae5d8edc
+size 696392
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..664929843f0c3a1b3a13ce3b32ed3017285af627
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffab362a1d710a8152644a848c877f8f63bdeb3754b6e6c6a4104ef1c1384db7
+size 243190
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..21bb8380543343ae4d986d940008d7072a8bee52
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c531ddfbd7271e198ff7ed5719cb60d123f713b7b930c04a3d9179b01bd25c46
+size 485196
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bcc9435f15310afae41e263ee4502a4452b80faf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l1/l1_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e87f5d62e04421430e6698473ba734c07db99d5c41cdb27054d5b5c80c45126
+size 174477
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..e5c3fec6e71833822eea82eeb9ac9b6a6a2d0233
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfdece18474cb2479a0ae487126743f4a2cae72f1f22e3de8d1a8129089ad71a
+size 957
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..97946cdb06cac7b6bf8e86dc9b06939d7a69a93f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:974a66e72ae987c1cd7f47fefd4756646e298739a7c879244b2c3ff0533fb19c
+size 953303
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..61c1ef1fd630555bc32f1a8a642287ee677caf4e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b5ac9fb3b46214d19d4f57c8ba3e7c483ddeff5f1a66326d32fd26f60014a5b
+size 101609
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5f15ba505defb98e7fb48de5ead684d4325ed081
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a895e0ba190688fd55d875bb6c8f27a8d8011e5c98da13e06742a1c350bf6c2
+size 122053
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..0863e7307ea569cd31c74fdd9d166dcf8c9fcae7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef1e27a6245d74e83a28d1a36f102dc2aa68e4cad93a82caa93687cb70ba8dbb
+size 22747
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1253584681058fbd0afe531214f250e17147207f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7ad30b0f4b01ab8eddef1b78046d4d91baeae2b59cc596baf5bcf40d091a63f
+size 379478
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..0fada19a737cc03e382aa418e0afbeec448f36a9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l2/l2_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa7f82a997f91817d7e7988c173d35b3f24564cf3bc9c93fe67372afd1dd662
+size 624028
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..144315ce398fb6dc49e14b7e1e6a88972196d5b7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c17f048659d1e2e4929c4e958b60a54353e7f8c70f51441d30c19c919c4f9cec
+size 775
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2e5a528e05e888bca43d8e9760df7ecee4751c1e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52c5c2a65a874a400085a32033397963f68794cbb906780c9bbb6fe52e3f5937
+size 896221
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..f726be80bb56e14aa9410793da5096f9a9178864
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a64c720306984eca6e57e89ee6368a4be02d80791aa89885344ce634c4b73eec
+size 20937
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..9a82cd3ef56b21e8779b8189260ff963fa2a4fae
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49d71ef4b973cd88b794666cf6179fd662f26c8da5cb0adf54fe26707ffb96fd
+size 389695
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3132ece57f7e1b4f59fa1cda77cb0073da8eb22e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f4b1336a4d3e96e93f3d7fb2678a4f5f30d261890ba6e9b1bc61231171dbc70
+size 99440
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..91f350d30e81788d8eb346b2eccb2c17c77de6a0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l3/l3_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61e29bac1d16bea9ab7ffbf9b493b1a597c2b14394b92c982bf7490ca64330b6
+size 671873
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..70f5f601a4c2aba96d2b1342836ef1662ed106d9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05f1cc50983886f4963f46ff05852a5553eba76f44d7e80bdc8ffcd9bfc634b1
+size 1503
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..0ea231d268aa577cefdc6b98566ed05715cd7f56
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ac9d58aa680a044b59980b81034acc0115c245e8ba4f5902455994e478c6583
+size 7258890
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..eb255bb57ab1a4b84ee4914a89ca8b424ffd3f08
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:934bb948f450ec9237e15af67331e6c15daaa45892dcad88d15d104517823fdc
+size 11477
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8f339cd5773b510563a77e8856420869e418b22a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c672c27bfe7690396af9e54d57ceda3fc89d6f08a69324f7ed38223cac15ee4
+size 107524
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6e48c759323aed20ae02db9d1480f909f854ac87
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ccf2480ddb77911343d9e59f5cad9d4838f32631de126122bc5c22399432e57e
+size 633606
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1f29511a5354830e12af68a2afb1def0a1147051
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0fe6f9f7dac9085445820563a5baa4a836cd9ae99ccbb02e683cf7806915ca1
+size 995593
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..426ba76f25cb4ee31dbd2049853c47c5f437006b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3977e1b8b7309bb1eb11b375424d45c367d3130ce1418675523b6c57fd7ae24a
+size 644926
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..68f3b265c20660616c8ddbb1876604de5e2a5e78
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a398b7fb72ff3cd40c5fe0ccf780ed086f1e4d9697036d9fe8ef5dba2d12a3c1
+size 865576
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5fda2d361d28ecfc38d051b7197aad3e5abc0da0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baee574065439546b0d5d5405d8920564589791a095040cd300136f026f38373
+size 1515936
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_7.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_7.obj
new file mode 100644
index 0000000000000000000000000000000000000000..598a52cf5631eea09860f77c0f9911315c418a92
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l4/l4_7.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd86085b7e95f9321f4571fafdb1b3c40e2a4efb5b056d6d97e98b293a4733f3
+size 4653975
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..bd633c15237cd4c54bdb5dcb2ac6bda0225d331c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35afdc0890972e202b4b329de72ff70cc9b878ed5e3379600edf4f71f27faae4
+size 957
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e7d0ca6e3d3c38efa2c925cad241fea20a5a4383
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e0d0f74383ee20bb4dbbab14dc21233a68458d842c7d972c7f68dc27ce55629
+size 2904791
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..7d9819671ac1f189127d7a90746e7c7a0e40fa6c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:752b0b4844e7c9f7abd82030d87fe14703cd175d13236cb5878c27ff91f90ad7
+size 28915
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bfcb392bcb8c2b7cdf244d2e28c68327bb510ed0
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f27709c10f5330e989c235fb943ad88314eb3a39d58f4edbb71391fe67476d26
+size 13227
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1de25c0a42b3ed9ef0f49f367c749644b138fd12
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b0bfab2eb17fee7e743c4c9ce819abab2681a11c3f34b9a1960261d374a8530
+size 227206
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5314a137e24676c14cedfafd58e063cb2adaae5c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c7f821e5262fce470a2789b1fa16b97217b94a5fb8800237065d2a2c529105d
+size 2903969
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6b5d58dc8987c559c0509f2dc6fb9a6baba55d40
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l5/l5_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ea77ab10772e5530c52205f850785490145eefe95fff213265a814add4538a0
+size 716571
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..7c1910e1520fadf6e38711fe4d16876b9d1f0055
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2450351ee19839756c275f770bcc2edf7b476b42eb74b334bf679b85962d7fa2
+size 1139
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ebf7bb4ae33f5338b0248d0674e1859dc942fa12
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e5207527499e30c9fedec5cd3983792c59b2fc932304c1a38b352fe7ee2c615
+size 2838795
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..234f0c11741af840626ce696dfc4fbbe08d6f06c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:033d560b31cf1ebfe4cdbc6de3c7f2c23ec8c547eb15533237dc8582ae5d7ee4
+size 9443
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e361b280d20aa5ffb8c6d05ba275a524183d9bf7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea00b3efde74ab54e978e9940bf0f9036d69f7b792e71004238da8a0f5542584
+size 24130
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6f8259eaeb779183701c3a64858bf6949bce5dca
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9eea7ea5457978d3eeba790d923878dceaa118fec00c318292f6421ef3e01380
+size 4713
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4dbe13c95c9571c942399f94359d6e413f704d41
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a335fb9fe5b133e89912c5b0dad19f84642543fed8f7af3635412f6307684f6c
+size 80416
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_4.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_4.obj
new file mode 100644
index 0000000000000000000000000000000000000000..e8d6f9f96194032504770309d3b0c39c37f0f80e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_4.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb7c8f765540a84023f2e4d3b319623c1a2c1bcbac0309a921226f419f505cb5
+size 127507
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_5.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_5.obj
new file mode 100644
index 0000000000000000000000000000000000000000..01263012338d344789f4343c14c63c5014df6b70
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/obj_meshes/l6/l6_5.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07488040bedbdbfd6aff80384fc287be45229e40614512501781860ce21efdc3
+size 3783211
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ad115e2c5fbc12235da3914491cd86c348a72c35
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/sawyer/robot.xml
@@ -0,0 +1,281 @@
+<!-- Sawyer robot.xml: 281 added lines of the MuJoCo (MJCF) robot model; the XML markup was stripped during text extraction and is not recoverable here -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base.dae
new file mode 100644
index 0000000000000000000000000000000000000000..858cb12e429b814657e2cef0309956e930c18dd7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7d37bb19bd062ca80ea00dfcd758b1145455c5cfad87d41756b101ac5b2a8a4
+size 358055
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base.stl
new file mode 100644
index 0000000000000000000000000000000000000000..17532f8b2337d284b57814b0b04df1cad492e7f1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a145e0d46f2130afdf2a2e8825a00a929870c4c3d6d8e4d1adc5f04db3aac1b
+size 21084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..5ac838f4f39b08b5b8a5bca10f4809b89b5cd246
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:663310d22dab14745f7a6067ba2b02f7c390474250b9a2d1330cbb55728e2f71
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..fa9db15c0bc4eeeddca2f786cd6c6c5de31a6dbd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:403e84ad717168f90758115872ac35f74a5a5330e4a76202ec461d82b9cbab79
+size 540706
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..831cd04e107d9527ea3385521d0b581dd65a8a6b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/base_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3007a4de228bc656964eea51203a2d04ae27a8be57f397b76e83a537f0df866a
+size 240784
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm.dae
new file mode 100644
index 0000000000000000000000000000000000000000..2483da94c077d3cb5a078906a71730bda836e628
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cb2f094ffba59f124f70881cabea985abfd399f2fbbe76fba7c18d2ece943b9
+size 1140936
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm.stl
new file mode 100644
index 0000000000000000000000000000000000000000..f45e03826c43012b7d691055029ff858dd11173d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e7423cab807c34160ec4f770daee5e747d70e777eb01b7beeace2b8c5751816
+size 53284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f952494dc4dc7877b9a59a4a451a068ab06ed939
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34ce0be2e1632e4fe37bf494e3893c75f2fc090be4290b65927658a123af7e0a
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1f2d17cad0bbe2c5d6c04fb61b63d5447f734726
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7496b538aeffefcdeb3885f339e3c636516a123d7ab85ef8c75530f203bb00e9
+size 1466045
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..28afda347a5f3a3750bd9307e772dc49ce6b190f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/forearm_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edc0313de11350874137be754939293c2b4ae810cad6aace786692b1aed8a640
+size 648934
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.dae
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b182cfdd00212645903b3980e1597d0cbd76e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9a74be4ae114c9acafccf68e1e49d8fd815ec030012ce1b60bdcf9b30db49f5
+size 2734652
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..679543fc2320f420d0679415d38b000ac9b756b7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88a713a17094f06d1c8a540dc80b92ab9ee1d504564ac879e4d181f34af46ae8
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4ee6a54adcfd8339d2a4e7ba79891734fe60b3ca
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/pedestal.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2ddd6d21418e64e98195e454b8e684b20ca15baa302dec8ce16f19fb0a48498
+size 2817264
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder.dae
new file mode 100644
index 0000000000000000000000000000000000000000..d942729f28425811f372408f256c60f5c3279f35
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2b58abbef50ce03d4704465d3b619a7da5b2ecb2efc236eadfd221116cbbef0
+size 1797082
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder.stl
new file mode 100644
index 0000000000000000000000000000000000000000..2afa86568142f0b614e766e149fd19b5ca23ff43
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ceb92532177daa77682f5fbd628e01c2137d168f949a7a706ce1dabe9f002387
+size 70084
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..93b0626b2e49a4d9b4d774ebdc5bbacf3a693caa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ea38dba4d482ea7d337c2aa84765da479f196af8428033f89d4b4172e396e55
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bcb6e0147c2ff39737753215242c119cb292e487
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bbce6bd41475f84a0c5fecb83791474558070d06caaa82196a558538c32f11c
+size 2416493
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..35dc031611f777189e0d801a5a8c504ee744a75e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/shoulder_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb64e003df6d9fd0c3716bcc662c4363bad8b66de2bd564de66156aad0c67403
+size 1056884
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm.dae
new file mode 100644
index 0000000000000000000000000000000000000000..712cceaaaffa5c2e96e060dbd4304faeee0ef8b6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6486526456e11585a41080958b97a0c8821da856e93b2059c74b07f8102bf6cd
+size 3082485
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm.stl
new file mode 100644
index 0000000000000000000000000000000000000000..d0535c386a41cb0f1086573e24660180d3635c2d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee893044caf00075cb55b4cf666d1f1311c7979786212a501009f33bee945209
+size 99684
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..b84076060ec8dc8d69265a37606bdc57c9092fd2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f6867485b58212fbbbc719e169c447cfbe96a11d695dfb20868f9c8fafd3461
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4ee9a68d1cf5453558a95d6d23cf82b2f0bdc16e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6f7168e435e18220cdf0d61cf67e5ec140d74651b7f9eea6a1138afccd5a696
+size 4049840
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..2c0c6cd88a2cb65056d64b0f64f77581393c8700
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/upperarm_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3805393009270b717f7eb4564d0fd9fb75b1e2199158a82dbd7216ea79f29f2
+size 1706034
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1.dae
new file mode 100644
index 0000000000000000000000000000000000000000..4eaa7dba1291817b44551da862aadb5c7d4823ba
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c36649cf4deec6da427d72d45163a059dcb668eb61aac760d5f2f979948fa13a
+size 1334662
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1.stl
new file mode 100644
index 0000000000000000000000000000000000000000..a4c0a83322115f8d31fbcf8a8a83968fd01497fa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8c9f9337b6fd98c75f052e96de10e14a107ddb6874ba6b904e546f8a4e4f43a
+size 59584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f11c62e9b0a5379da28889c2c8fd3f20e97f68b9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95f2b261b30a808efdecda2abd67b18fb695ed9a0691586f299274de35617327
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..fde49bf1532ce3d03160f487fbcb7e8e456855aa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6eaeff6ea4f4641e142f0f8cd191c01d3d1547df095e5720671e1c172872bd8
+size 1800469
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..83d5d014053a35dcf10f05768f722b0275de98a5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist1_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c9d1740c05ace364f4a24447a0d819da48755b31235242c69c8bc15ea5c61d6
+size 806284
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2.dae
new file mode 100644
index 0000000000000000000000000000000000000000..351f8942d4d382b536eb75fe4570e97b48be4596
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c99f5538357a7b42a9b207a12e52e713dd3eb7e587645144bde7ab31c1cfc76b
+size 1554838
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2.stl
new file mode 100644
index 0000000000000000000000000000000000000000..44ce896e6b5194de0a569779e5dd2e8766552685
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2964a63f60ce3e3cf3ad55bcf190d7876d50e373cb64b70a57cea5885eaf3c86
+size 67584
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..c6ad2d69c16350e3d807cce64fad324f526c97fa
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d711567f9decdeb7f44b848e75c44cd119aab9422b1ae48cb2f5ccd6747aa13
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d4e506eaf1b7ac02bc9c4d2b38536a850aeb31f6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d92696777d24520869088c69615f0a2a4bcbbcb8b6675d2e97d3e8ee3a2e85a
+size 2099344
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..928005c93f54aa784dd9d66003100469d61b4028
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist2_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c16241088d427c1fd263be78220908b530ed296682206ffe9430b36c84b4ab74
+size 946784
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3.dae b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3.dae
new file mode 100644
index 0000000000000000000000000000000000000000..4a91d5b7bd2c2709dc0ddbc0a8391327570fbcc7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3.dae
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f5d4f544ae72698a41f75ed4897a76386a91b814dff22e30e93a5bd105a717c
+size 66076
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3.stl
new file mode 100644
index 0000000000000000000000000000000000000000..2db25af90ecf3a43388f22482dd8344c34380df5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83b3666b4ae2badd54af0d2c25a921682ecbc29e849eec646c3ed55fb74c78a3
+size 7184
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..9d87daa5e45f71ea19edb0c740285eb916e0cd65
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57585ece09efa8e1722f23649baa44b67758dcbb08662dd248da0e0b8e5e0858
+size 236
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..26707f2f604136335a06a01531b7f50e89773438
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caa699b340ddbb85003d2356ac41e9c020585c77d42d9e268414c2665ef92b97
+size 92076
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.stl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.stl
new file mode 100644
index 0000000000000000000000000000000000000000..7a0079ec3cdc20d8224d4dd81f7ec645540399f4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/meshes/wrist3_vis.stl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:275a0adeb5737c0ea951036071826a348d98c464bf0d5000a896935241e65712
+size 45634
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/no_texture_robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/no_texture_robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..d5fd5f77e9c1053110b6cd381d5fb25bdb8630db
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/no_texture_robot.xml
@@ -0,0 +1,80 @@
+<!-- UR5e no_texture_robot.xml: 80 added lines of the MuJoCo (MJCF) robot model; the XML markup was stripped during text extraction and is not recoverable here -->
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..777a0cdf4c41894fe746f81e6f94085767caaed4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27e8ad5a2fbbd8dd54a7308654a1224451b7c8ecd05b39cc461bf9c5974f841b
+size 421
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..62412afd1e36c4545790155678a2d7216f6f71d8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0297e6e882404a2e1f34d4d18e0b4a1c4d9d4d08d5360b20981c8bf15bb75666
+size 344266
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis/base_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis/base_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..2c22fd6e2346ec4cb4262cc77d0850f068281f66
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis/base_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:192729de151e0982652acca1a97b64337a9ef22bd3ddf2420fa2b8e782afd3f9
+size 555556
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis/base_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis/base_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..17b44e9f3b19e43d3569314d87945f53206ec711
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/base_vis/base_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:436805384ad1f8f2ccaf028fed33596df4e156656fe4eeb73bfb1d32f51ccebd
+size 638730
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..330f355c55c28e067ecea287c0fb3b8c26dc664b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66c34739f2772474b80b3f86381ca2810b4bc7e036dd6f4311d59661cf85bee7
+size 775
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..fb30cdc5b32c36883200b315e93b6ba437425b2f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e5a706619c54ebc0e74d0cdc838f63066a47df43156cb431813243f86e14946
+size 1134710
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5f16f8d858ed1d92ce0c531b10b9d0f290bc2793
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db3cc5aabd8b4ff3e2392f7aa989a1ac3829a8c692cc84a44fc6d161e0c9e25f
+size 1692114
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..86331383f406767a7551016407bfe5ddcdc0344c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:593487745160c53175b1329aa7a56275fd4ffad24c86e56966fbcaafe00cb74e
+size 76433
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..1aeb038fb9310f7928a1baf6a019f0d50a9ffd8f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f68fd607a6166a0e2fcb162f472db24aa7008a369d878b700d5ca3f499e3f850
+size 933173
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..64b386b9c40b280be19cc8bf738d8f67bc8406ee
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/forearm_vis/forearm_vis_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0fe9c0c9eb1b2612906effab179846c71b82c910ea73dfac3198ad1854189fd
+size 1096820
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..7b6c830eb2405b236f1024d32554617033e6ec3a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d47f710de6c043902d31ef3133711e042ebc2dff1b4a6aae871a557cea494488
+size 605
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c00b27d0d135a109ad67ec7671d135d555181d93
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e910c05bfc38f0d8b31706074ddbc82e041d2e6feb07467539e5a537033c819
+size 1796581
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..0f708fabf54a029e23c7f7af6a8cddbfa8da7cea
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6f46fbcdc3eb91856d29a54acbbb39242f422dcc62b6d5590685b057cf8be1a
+size 2946979
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..6b7753b61129c27c6e5b963ab89533b079024e6b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:922264d6abbaaa4d7cce453b3371dc7b9dfc8dd46961a64c835b431331e6643d
+size 569961
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5b4fa73f4d0cd47cf2d6be8476bc01b72a9506b4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/shoulder_vis/shoulder_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8b5432ca017f953da2ce264bfb8de2ac2b61eb5c85d472d06d4b9da3c3c9ab8
+size 2441153
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..a4aa777df8ceaa3f387f491b3ee69e800af61cbe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fd967fc19fbe1cf8e88fe75ec15d8d61e6c80358f1fbf23bcc534d2684fdbd0
+size 791
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..fb292eafbc8057b387bf59e4924c7caeabd401c6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07a32b121699a5e282caee0478f9ce1848715549c6afe6654bd4c9b2eb711c12
+size 3158403
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3e013b959e6bb18baaa914b67bb76ec0f5e18811
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e53e6302c7937d8c0853fdd600bcd4adf3767f586ed9a12170b4159f8f4a5b1
+size 156260
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..d7764e464e8d940498e6a7684db933efe751afea
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ce040b7f759bbeb6874005457a84239d02690c7eb22fc85d4876eb258ef4c70
+size 1090424
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..4db6b9afff037cccbe6f54ef1a76c0101caa66bc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebc641d2d76b981acd673fb6644895ce56a8c10e0109cbc4a8cc8c59e540e862
+size 3684774
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_3.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_3.obj
new file mode 100644
index 0000000000000000000000000000000000000000..bcf9440ab9ce851f3cae0ac12f3d78e8e4ffabfe
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/upperarm_vis/upperarm_vis_3.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a965ee78d60112a72bcc33d173d6669f09a5854ab157c2c6df1b3f5857bbc3f
+size 5345375
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..c124e07a96b651de2f2b799748c947de84c62bdf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b29367ba1b140f1540c05ae8eb0837ccd5f7fb906bd0a323b6e7d8e8d1617af5
+size 605
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..746d1a99369f672a0d615fdb803667e96eebee3f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:022c689e3363f7fbd147ca4c0495489bd5c6e51f06f02c5c36ab0110b2aa7aa3
+size 1319390
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..b45db7eca2622ab5ca0e9cf6f9cf22ad3d022340
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44d267ff1e725fc4a59ed6ab2f2eeb58356871d98d868d8ba08555bbd8b0af5d
+size 278772
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..c19ea037389a5411951b71b8fe79f4be61c95844
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f04a25bc0827e903c9d001a16173e33197641280317b1d9e78e473f476052764
+size 2470993
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ffbdabf5c24241ba6e6d1f758ecf698bb9891f8e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist1_vis/wrist1_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23e82bbd4d6d21084d1a72d50f4d027d413ac0cb23edcb0b1c76f6c162da6a11
+size 1658162
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..f090d8e29b3f4c781b7de56f5c85c767bcdfa938
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d15722d0ecaedb182f661dbabe67e0fa89a8fb05348458cf7d70f65044d1da2
+size 605
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..830afb040ee47f04d8728ba8019bf60159a83c5f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:715a3abe4cd81466113069a39663ca07249e795be712e613b21127b294142bdc
+size 1548411
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_0.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_0.obj
new file mode 100644
index 0000000000000000000000000000000000000000..8d9ed22f08b55ec5f52d60e1aca9ae6ae1a623af
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_0.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6482ce9227c3a4e0bc5af577c33008fda74723b7e8b72b5ffa6a2b5bf42651c3
+size 846654
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_1.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_1.obj
new file mode 100644
index 0000000000000000000000000000000000000000..68a3731f7d1d5e94eea2a3cb45f66794f4531b7d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_1.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34c774dab175b810eb364b0955feaaa9648ad33bc4cd445f2d87c7a5f0d8faad
+size 2156472
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_2.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_2.obj
new file mode 100644
index 0000000000000000000000000000000000000000..13f899867fbb314fa3367dce49b776a81a5687f2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist2_vis/wrist2_vis_2.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a382e17343660e5757f03fd1f8d5dbd07cbcb5828b3c838815bdb1bf0c9d3004
+size 2141246
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis.mtl b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis.mtl
new file mode 100644
index 0000000000000000000000000000000000000000..c79271118c99ead228f3313844b968e893938cc6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis.mtl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d3ebe91449c0ba69e2d91be33a24185d8ded3f2f975a05b3c504752db1f1692
+size 237
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..24af72a1aa0258d1cab7ededadb8658def10a961
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a36deaacff34e810bbf86381ff2bce5cd3ecbced6486e3336c51649b2067881f
+size 58971
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis/wrist3_vis.obj b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis/wrist3_vis.obj
new file mode 100644
index 0000000000000000000000000000000000000000..5955b8a7a6536575d850a8edb619264553c65886
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/obj_meshes/wrist3_vis/wrist3_vis.obj
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:953c3ab8ff91983ba79cf139eec47853ded3e53943c366aaed84f60e62172e10
+size 211663
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/robot.xml b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/robot.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1d0f101cb287fb286dda3be091a3432d137e889e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/robots/ur5e/robot.xml
@@ -0,0 +1,132 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/blue-wood.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/blue-wood.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca06668e5038d9f5bea19a8045026e0e50cbc94e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/blue-wood.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b1eab21395a05d7463751f9075b16bc4fba8fd4be42938bda25ea634a345884
+size 201294
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/brass-ambra.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/brass-ambra.png
new file mode 100644
index 0000000000000000000000000000000000000000..1221dbf0dd2471cc3d7318a49b66fe03b06f3649
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/brass-ambra.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:246007818045b92276631294bb61d797806f2381d7044e98b6e3c4465f8e085c
+size 1720843
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/bread.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/bread.png
new file mode 100644
index 0000000000000000000000000000000000000000..f23b705dc737519490d20f7f8fce2b1676d8a15d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/bread.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22814a06ae3c7dc097801887e6aec97f830b7368b611cab56d1c4ad5ae57bddc
+size 518677
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/can.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/can.png
new file mode 100644
index 0000000000000000000000000000000000000000..1cbc5117af16dce9ef31b567f240f993f8aece0a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/can.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d1b29ff348a4425f84bb0de6b46139446b7e950058348d52fced0a707558e02
+size 586402
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/ceramic.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/ceramic.png
new file mode 100644
index 0000000000000000000000000000000000000000..891c50a69b45e57cc9befa6f8fcc579d35552fc4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/ceramic.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:beef464a2aa9143ba6a25a44adcbfd1f1aff16c5969edbf08019b61025388402
+size 1442030
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/cereal.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/cereal.png
new file mode 100644
index 0000000000000000000000000000000000000000..5486bb143fe9407be3e5f9d34f30e95d61b543cb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/cereal.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa0be8e701b41b3a7284bcd1e6e18adb541d007d00b2dbfe69772620bea790c8
+size 542717
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/clay.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/clay.png
new file mode 100644
index 0000000000000000000000000000000000000000..d264b69c1a3d4be139085c9c6bacab5ef34c7049
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/clay.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ea2ced7f0f07996292314e3f3f5df68c4d8c5b2cbf2180c899ed269906fdcd2
+size 634465
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/cream-plaster.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/cream-plaster.png
new file mode 100644
index 0000000000000000000000000000000000000000..d77333eb1bd7d44057939b3c12c8d9a620e4a8e1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/cream-plaster.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f989033f089158b2c8f0f1f7881f6922b018a5b154558b2edcc9f41a21ab983
+size 696397
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/dark-wood.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/dark-wood.png
new file mode 100644
index 0000000000000000000000000000000000000000..09784cca1e9f2de13f0306189a9ad21aaa47241b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/dark-wood.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5da8dbf7af06ecd251840aceec25fedcfa98267041cea289b2829c982e008dc3
+size 239227
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/dirt.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/dirt.png
new file mode 100644
index 0000000000000000000000000000000000000000..eaeda47e2951d5cf35cc9881571235ebeb3fcb4c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/dirt.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b410966a8e0c1d52e2847bdb61aa51c6291efcd7e92839923637d0dcdb91e973
+size 533543
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/glass.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/glass.png
new file mode 100644
index 0000000000000000000000000000000000000000..c48c108132eb21a8f4a17a06d778b04d5c22dcb9
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/glass.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e12c4e1ed663ba3b690a701626e80189be02228f6d29c985273f15da4de423a
+size 89051
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-felt.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-felt.png
new file mode 100644
index 0000000000000000000000000000000000000000..66ee5d48f22f4f66a9fe70865865063b14e2f6b2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-felt.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04c1b483f04be0bd3d26a42ca530c05244cf4b52c0ccb027dbb8fc2bee41ca05
+size 1037473
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-plaster.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-plaster.png
new file mode 100644
index 0000000000000000000000000000000000000000..fcf71d0d1dcff614a27b5d521f0986c197cc29d2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-plaster.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ee7520ebf6118051ced809cb9754983e7537a40f8d5271d5cb2fabdd71c33bd
+size 474599
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-woodgrain.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-woodgrain.png
new file mode 100644
index 0000000000000000000000000000000000000000..e72ed266cce2951efb4c2f125b4defa31719d833
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/gray-woodgrain.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9e65c147f2683dbcfe6279b359db649aef0cb40ef8b1e3ed5ba6f29fab6b3d0
+size 267843
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/green-wood.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/green-wood.png
new file mode 100644
index 0000000000000000000000000000000000000000..2a6cc1080f72267c3640eadd87fcb5418eeeb4e7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/green-wood.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4fd42ec6dea58f46aa96fc1d78d57c52450a6497dcad887ae8e2ff8c4af3797
+size 83480
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/lemon.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/lemon.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e9743ccc21070f6a053e27517d8461b6fbf1217
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/lemon.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93a21938e5fe8d62a042c5591b13b9bec75d94791f7d08f375beb60fce523610
+size 1919770
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-gray-floor-tile.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-gray-floor-tile.png
new file mode 100644
index 0000000000000000000000000000000000000000..70a290f75e6545ae5bc745acadf63566fadd6c2c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-gray-floor-tile.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2aae4ba3b38c3ab9851eca9ad6e76cef9357ef75b3e9a4b280f694ca30acbc5
+size 73373
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-gray-plaster.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-gray-plaster.png
new file mode 100644
index 0000000000000000000000000000000000000000..8553f5e438ccf68cfe9ce9300ce8c94712ba1e8f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-gray-plaster.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0bdaf13b44ab6ef3122451f6b3853e40bdf20bb9be7662649683ba525dfe191
+size 516639
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-wood.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-wood.png
new file mode 100644
index 0000000000000000000000000000000000000000..92f9d6aa33937bb75a49fb9543e0c6a74a88aff6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/light-wood.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:967be64bdb170d36283bc43c2ae5d525ec88f78bc837dc7d0efc9a28d68caad1
+size 801173
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/metal.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/metal.png
new file mode 100644
index 0000000000000000000000000000000000000000..f5260ff57151e704fb133e4e0ea88af4a455ee24
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/metal.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93784216a66b52be9c8ccc3732e50f511815291e32f34f97bd139ed7229061fa
+size 196878
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/pink-plaster.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/pink-plaster.png
new file mode 100644
index 0000000000000000000000000000000000000000..08abc2bc4cacd7d533d1275e3275626c15195eaf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/pink-plaster.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6f8435c8aec555fa89ee72d472d652f31ce1afc0e07372e9857b0aa50f6e029
+size 553251
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/red-wood.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/red-wood.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d1e2e39d020002dc548ff27c7c1b6184d025583
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/red-wood.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d42411e594d6be26c4876ada623b163af6621402462b244d86eaf6e72aca6494
+size 1797019
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/soda.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/soda.png
new file mode 100644
index 0000000000000000000000000000000000000000..5d48f08b91de2820aeb9687371947d4034f11809
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/soda.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2774d29c6c16e1a3b3156e6679897a1695127ec59463d2de1a7ffd7f11b41b4d
+size 627792
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/steel-brushed.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/steel-brushed.png
new file mode 100644
index 0000000000000000000000000000000000000000..a072534801ccb26c8512195f490edc9f1b8295ab
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/steel-brushed.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43f30e7a2d0e8085aa15f58623b1753a003f93e5fe336eeaf65c146f955a81d5
+size 275192
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/steel-scratched.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/steel-scratched.png
new file mode 100644
index 0000000000000000000000000000000000000000..96c24dd05d9dd8ac7be045baf71c668582b1cd7d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/steel-scratched.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e22a111d516cc84addde9b2b18353d8a6ec521b9b79ec5e9c169e2d9000d3fe
+size 391216
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/white-bricks.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/white-bricks.png
new file mode 100644
index 0000000000000000000000000000000000000000..04ce966e64c68766b372a84bfc57323e5bbf7b92
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/white-bricks.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:864e2bfb462939212c4141a060eb4c3f34c891b6909873797b08f8c4185433d1
+size 1458770
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/white-plaster.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/white-plaster.png
new file mode 100644
index 0000000000000000000000000000000000000000..7972a9b8fc1bf5a9ee03919c0cf6ee6f04513ba6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/white-plaster.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a35c03edcef74c940400dd39e2384d05169d1d15ac247166b3b7a930acd70f03
+size 624094
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/wood-tiles.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/wood-tiles.png
new file mode 100644
index 0000000000000000000000000000000000000000..e9988e92a4aa98c43e46a274bc9aaa49096a0cf5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/wood-tiles.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18d87069529422d832559ce969858576dd46f83efc3dfd9e5ffbf26d740be123
+size 1460776
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/wood-varnished-panels.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/wood-varnished-panels.png
new file mode 100644
index 0000000000000000000000000000000000000000..28c7819137619bf1eaa2ad1fd77612c4b14c255f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/wood-varnished-panels.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67b48dc99b89004d98cb0811f170d39993e7e205cc8d5bd3d01a4be137e33a85
+size 466710
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/yellow-plaster.png b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/yellow-plaster.png
new file mode 100644
index 0000000000000000000000000000000000000000..0610295bf881b2e6d78f5fc72811e4cd2f1b0702
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/yellow-plaster.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f98027e5dd4ad845a1a20f302b183c62b8e8fab54799634c5143c1dde0d0362
+size 477879
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/base.py b/phantom/submodules/phantom-robosuite/robosuite/models/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e6f98d47904d4c05c8e8c34fb297d8cf570398
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/base.py
@@ -0,0 +1,696 @@
+import io
+import os
+import xml.dom.minidom
+import xml.etree.ElementTree as ET
+
+import robosuite.macros as macros
+from robosuite.utils import XMLError
+from robosuite.utils.mjcf_utils import (
+ _element_filter,
+ add_material,
+ add_prefix,
+ find_elements,
+ recolor_collision_geoms,
+ sort_elements,
+ string_to_array,
+)
+
+
+class MujocoXML(object):
+ """
+ Base class of Mujoco xml file
+ Wraps around ElementTree and provides additional functionality for merging different models.
+ Specially, we keep track of , and
+
+ When initialized, loads a mujoco xml from file.
+
+ Args:
+ fname (str): path to the MJCF xml file.
+ """
+
+ def __init__(self, fname):
+ self.file = fname
+ self.folder = os.path.dirname(fname)
+ self.tree = ET.parse(fname)
+ self.root = self.tree.getroot()
+ self.worldbody = self.create_default_element("worldbody")
+ self.actuator = self.create_default_element("actuator")
+ self.sensor = self.create_default_element("sensor")
+ self.asset = self.create_default_element("asset")
+ self.tendon = self.create_default_element("tendon")
+ self.equality = self.create_default_element("equality")
+ self.contact = self.create_default_element("contact")
+
+ # Parse any default classes and replace them inline
+ default = self.create_default_element("default")
+ default_classes = self._get_default_classes(default)
+ self._replace_defaults_inline(default_dic=default_classes)
+
+ # Remove original default classes
+ self.root.remove(default)
+
+ self.resolve_asset_dependency()
+
+ def resolve_asset_dependency(self):
+ """
+ Converts every file dependency into absolute path so when we merge we don't break things.
+ """
+
+ for node in self.asset.findall("./*[@file]"):
+ file = node.get("file")
+ abs_path = os.path.abspath(self.folder)
+ abs_path = os.path.join(abs_path, file)
+ node.set("file", abs_path)
+
+ def create_default_element(self, name):
+ """
+ Creates a <@name/> tag under root if there is none.
+
+ Args:
+ name (str): Name to generate default element
+
+ Returns:
+ ET.Element: Node that was created
+ """
+
+ found = self.root.find(name)
+ if found is not None:
+ return found
+ ele = ET.Element(name)
+ self.root.append(ele)
+ return ele
+
+ def merge(self, others, merge_body="default"):
+ """
+ Default merge method.
+
+ Args:
+ others (MujocoXML or list of MujocoXML): other xmls to merge into this one
+ raises XML error if @others is not a MujocoXML instance.
+                merges the <worldbody/>, <asset/>, <actuator/>, <sensor/>, <tendon/>, <equality/> and <contact/> elements of @others into @self
+ merge_body (None or str): If set, will merge child bodies of @others. Default is "default", which
+                corresponds to the root worldbody for this XML. Otherwise, should be the name of a body that
+                already exists in this XML. None results in @other's worldbody bodies not being merged.
+
+ Raises:
+ XMLError: [Invalid XML instance]
+ """
+ if type(others) is not list:
+ others = [others]
+ for idx, other in enumerate(others):
+ if not isinstance(other, MujocoXML):
+ raise XMLError("{} is not a MujocoXML instance.".format(type(other)))
+ if merge_body is not None:
+ root = (
+ self.worldbody
+ if merge_body == "default"
+ else find_elements(
+ root=self.worldbody, tags="body", attribs={"name": merge_body}, return_first=True
+ )
+ )
+ for body in other.worldbody:
+ root.append(body)
+ self.merge_assets(other)
+ for one_actuator in other.actuator:
+ self.actuator.append(one_actuator)
+ for one_sensor in other.sensor:
+ self.sensor.append(one_sensor)
+ for one_tendon in other.tendon:
+ self.tendon.append(one_tendon)
+ for one_equality in other.equality:
+ self.equality.append(one_equality)
+ for one_contact in other.contact:
+ self.contact.append(one_contact)
+
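+    # Illustrative usage sketch (not part of the original source; the file names below are
+    # hypothetical):
+    #
+    #   base = MujocoXML("arena.xml")
+    #   extra = MujocoXML("extra_objects.xml")
+    #   base.merge(extra)                             # extra's bodies land in base's worldbody;
+    #                                                 # assets, actuators, sensors, etc. are appended
+    #   base.save_model("combined.xml", pretty=True)
+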
+ def get_model(self, mode="mujoco"):
+ """
+ Generates a MjModel instance from the current xml tree.
+
+ Args:
+ mode (str): Mode with which to interpret xml tree
+
+ Returns:
+ MjModel: generated model from xml
+
+ Raises:
+ ValueError: [Invalid mode]
+ """
+
+ available_modes = ["mujoco"]
+ with io.StringIO() as string:
+ string.write(ET.tostring(self.root, encoding="unicode"))
+ if mode == "mujoco":
+ import mujoco
+
+ model = mujoco.MjModel.from_xml_string(string.getvalue())
+ return model
+ raise ValueError("Unkown model mode: {}. Available options are: {}".format(mode, ",".join(available_modes)))
+
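+    # Sketch of get_model usage (illustrative; `xml_obj` stands for any MujocoXML instance and
+    # the `mujoco` python bindings must be installed, as imported inside get_model):
+    #
+    #   model = xml_obj.get_model(mode="mujoco")      # mujoco.MjModel built from the XML tree
+    #   data = mujoco.MjData(model)                   # simulation state for that model
+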
+ def get_xml(self):
+ """
+ Reads a string of the MJCF XML file.
+
+ Returns:
+ str: XML tree read in from file
+ """
+ with io.StringIO() as string:
+ string.write(ET.tostring(self.root, encoding="unicode"))
+ return string.getvalue()
+
+ def save_model(self, fname, pretty=False):
+ """
+ Saves the xml to file.
+
+ Args:
+ fname (str): output file location
+            pretty (bool): If True, attempts to pretty-print the output
+ """
+ with open(fname, "w") as f:
+ xml_str = ET.tostring(self.root, encoding="unicode")
+ if pretty:
+ parsed_xml = xml.dom.minidom.parseString(xml_str)
+ xml_str = parsed_xml.toprettyxml(newl="")
+ f.write(xml_str)
+
+ def merge_assets(self, other):
+ """
+        Merges @other's assets into this one, skipping any asset whose name already exists.
+
+ Args:
+ other (MujocoXML or MujocoObject): other xml file whose assets will be merged into this one
+ """
+ for asset in other.asset:
+ if (
+ find_elements(root=self.asset, tags=asset.tag, attribs={"name": asset.get("name")}, return_first=True)
+ is None
+ ):
+ self.asset.append(asset)
+
+ def get_element_names(self, root, element_type):
+ """
+ Searches recursively through the @root and returns a list of names of the specified @element_type
+
+ Args:
+ root (ET.Element): Root of the xml element tree to start recursively searching through
+ (e.g.: `self.worldbody`)
+ element_type (str): Name of element to return names of. (e.g.: "site", "geom", etc.)
+
+ Returns:
+ list: names that correspond to the specified @element_type
+ """
+ names = []
+ for child in root:
+ if child.tag == element_type:
+ names.append(child.get("name"))
+ names += self.get_element_names(child, element_type)
+ return names
+
+ @staticmethod
+ def _get_default_classes(default):
+ """
+ Utility method to convert all default tags into a nested dictionary of values -- this will be used to replace
+ all elements' class tags inline with the appropriate defaults if not specified.
+
+ Args:
+ default (ET.Element): Nested default tag XML root.
+
+ Returns:
+ dict: Nested dictionary, where each default class name is mapped to its own dict mapping element tag names
+ (e.g.: geom, site, etc.) to the set of default attributes for that tag type
+ """
+ # Create nested dict to return
+ default_dic = {}
+ # Parse the default tag accordingly
+ for cls in default:
+ default_dic[cls.get("class")] = {child.tag: child for child in cls}
+ return default_dic
+
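+    # Worked example (illustrative): for a defaults block such as
+    #
+    #   <default class="visual">
+    #       <geom contype="0" conaffinity="0" group="1"/>
+    #   </default>
+    #
+    # _get_default_classes returns {"visual": {"geom": <that geom element>}}, and
+    # _replace_defaults_inline (below) copies contype/conaffinity/group onto any
+    # <geom class="visual"/> that does not set those attributes explicitly.
+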
+ def _replace_defaults_inline(self, default_dic, root=None):
+ """
+ Utility method to replace all default class attributes recursively in the XML tree starting from @root
+        with the corresponding defaults in @default_dic if they are not explicitly specified for a given element.
+
+ Args:
+            root (ET.Element): Root of the xml element tree to start recursively replacing defaults. Only used by
+ recursive calls
+ default_dic (dict): Nested dictionary, where each default class name is mapped to its own dict mapping
+ element tag names (e.g.: geom, site, etc.) to the set of default attributes for that tag type
+ """
+ # If root is None, this is the top level call -- replace root with self.root
+ if root is None:
+ root = self.root
+        # Check whether the current element specifies a default class
+ cls_name = root.attrib.pop("class", None)
+ if cls_name is not None:
+ # If the tag for this element is contained in our default dic, we add any defaults that are not
+            # explicitly specified in this element
+ tag_attrs = default_dic[cls_name].get(root.tag, None)
+ if tag_attrs is not None:
+ for k, v in tag_attrs.items():
+ if root.get(k, None) is None:
+ root.set(k, v)
+ # Loop through all child elements
+ for child in root:
+ self._replace_defaults_inline(default_dic=default_dic, root=child)
+
+ @property
+ def name(self):
+ """
+ Returns name of this MujocoXML
+
+ Returns:
+ str: Name of this MujocoXML
+ """
+ return self.root.get("model")
+
+
+class MujocoModel(object):
+ """
+ Base class for all simulation models used in mujoco.
+
+ Standardizes core API for accessing models' relevant geoms, names, etc.
+ """
+
+ def correct_naming(self, names):
+ """
+        Corrects all strings in @names by adding the naming prefix to them and returns the name-corrected values
+
+ Args:
+ names (str, list, or dict): Name(s) to be corrected
+
+ Raises:
+ TypeError: [Invalid input type]
+ """
+ if type(names) is str:
+ return self.naming_prefix + names if not self.exclude_from_prefixing(names) else names
+ elif type(names) is list:
+ return [self.naming_prefix + name if not self.exclude_from_prefixing(name) else name for name in names]
+ elif type(names) is dict:
+ names = names.copy()
+ for key, val in names.items():
+ names[key] = self.correct_naming(val)
+ return names
+ else:
+ # Assumed to be type error
+ raise TypeError("Error: type of 'names' must be str, list, or dict!")
+
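+    # Example (sketch): with naming_prefix "gripper0_", correct_naming("hand") returns
+    # "gripper0_hand", correct_naming(["left_pad", "right_pad"]) returns
+    # ["gripper0_left_pad", "gripper0_right_pad"], and dict values are corrected recursively.
+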
+ def set_sites_visibility(self, sim, visible):
+ """
+ Set all site visual states for this model.
+
+ Args:
+ sim (MjSim): Current active mujoco simulation instance
+ visible (bool): If True, will visualize model sites. Else, will hide the sites.
+ """
+ # Loop through all visualization geoms and set their alpha values appropriately
+ for vis_g in self.sites:
+ vis_g_id = sim.model.site_name2id(vis_g)
+ if (visible and sim.model.site_rgba[vis_g_id][3] < 0) or (
+ not visible and sim.model.site_rgba[vis_g_id][3] > 0
+ ):
+ # We toggle the alpha value
+ sim.model.site_rgba[vis_g_id][3] = -sim.model.site_rgba[vis_g_id][3]
+
+ def exclude_from_prefixing(self, inp):
+ """
+ A function that should take in an arbitrary input and return either True or False, determining whether the
+        name corresponding to @inp should have naming_prefix added to it. Must be defined by subclass.
+
+ Args:
+ inp (any): Arbitrary input, depending on subclass. Can be str, ET.Element, etc.
+
+ Returns:
+ bool: True if we should exclude the associated name(s) with @inp from being prefixed with naming_prefix
+ """
+ raise NotImplementedError
+
+ @property
+ def name(self):
+ """
+ Name for this model. Should be unique.
+
+ Returns:
+ str: Unique name for this model.
+ """
+ raise NotImplementedError
+
+ @property
+ def naming_prefix(self):
+ """
+ Generates a standardized prefix to prevent naming collisions
+
+ Returns:
+ str: Prefix unique to this model.
+ """
+ raise NotImplementedError
+
+ @property
+ def root_body(self):
+ """
+ Root body name for this model. This should correspond to the top-level body element in the equivalent mujoco xml
+ tree for this object.
+ """
+ raise NotImplementedError
+
+ @property
+ def bodies(self):
+ """
+ Returns:
+ list: Body names for this model
+ """
+ raise NotImplementedError
+
+ @property
+ def joints(self):
+ """
+ Returns:
+ list: Joint names for this model
+ """
+ raise NotImplementedError
+
+ @property
+ def actuators(self):
+ """
+ Returns:
+ list: Actuator names for this model
+ """
+ raise NotImplementedError
+
+ @property
+ def sites(self):
+ """
+ Returns:
+ list: Site names for this model
+ """
+ raise NotImplementedError
+
+ @property
+ def sensors(self):
+ """
+ Returns:
+ list: Sensor names for this model
+ """
+ raise NotImplementedError
+
+ @property
+ def contact_geoms(self):
+ """
+ List of names corresponding to the geoms used to determine contact with this model.
+
+ Returns:
+ list: relevant contact geoms for this model
+ """
+ raise NotImplementedError
+
+ @property
+ def visual_geoms(self):
+ """
+ List of names corresponding to the geoms used for visual rendering of this model.
+
+ Returns:
+ list: relevant visual geoms for this model
+ """
+ raise NotImplementedError
+
+ @property
+ def important_geoms(self):
+ """
+ Geoms corresponding to important components of this model. String keywords should be mapped to lists of geoms.
+
+ Returns:
+ dict of list: Important set of geoms, where each set of geoms are grouped as a list and are
+ organized by keyword string entries into a dict
+ """
+ raise NotImplementedError
+
+ @property
+ def important_sites(self):
+ """
+ Dict of sites corresponding to the important site geoms (e.g.: used to aid visualization during sim).
+
+ Returns:
+ dict: Important site geoms, where each specific geom name is mapped from keyword string entries
+ in the dict
+ """
+ raise NotImplementedError
+
+ @property
+ def important_sensors(self):
+ """
+ Dict of important sensors enabled for this model.
+
+ Returns:
+ dict: Important sensors for this model, where each specific sensor name is mapped from keyword string
+ entries in the dict
+ """
+ raise NotImplementedError
+
+ @property
+ def bottom_offset(self):
+ """
+ Returns vector from model root body to model bottom.
+ Useful for, e.g. placing models on a surface.
+ Must be defined by subclass.
+
+ Returns:
+ np.array: (dx, dy, dz) offset vector
+ """
+ raise NotImplementedError
+
+ @property
+ def top_offset(self):
+ """
+ Returns vector from model root body to model top.
+ Useful for, e.g. placing models on a surface.
+ Must be defined by subclass.
+
+ Returns:
+ np.array: (dx, dy, dz) offset vector
+ """
+ raise NotImplementedError
+
+ @property
+ def horizontal_radius(self):
+ """
+ Returns maximum distance from model root body to any radial point of the model.
+
+        Helps us place models programmatically without them flying away due to a huge initial contact force.
+ Must be defined by subclass.
+
+ Returns:
+ float: radius
+ """
+ raise NotImplementedError
+
+
+class MujocoXMLModel(MujocoXML, MujocoModel):
+ """
+ Base class for all MujocoModels that are based on a raw XML file.
+
+ Args:
+ fname (str): Path to relevant xml file from which to create this robot instance
+ idn (int or str): Number or some other unique identification string for this model instance
+ """
+
+ def __init__(self, fname, idn=0):
+ super().__init__(fname)
+
+ # Set id and add prefixes to all body names to prevent naming clashes
+ self.idn = idn
+
+ # Define other variables that get filled later
+ self.mount = None
+
+ # Define filter method to automatically add a default name to visual / collision geoms if encountered
+ group_mapping = {
+ None: "col",
+ "0": "col",
+ "1": "vis",
+ }
+ ctr_mapping = {
+ "col": 0,
+ "vis": 0,
+ }
+
+ def _add_default_name_filter(element, parent):
+ # Run default filter
+ filter_key = _element_filter(element=element, parent=parent)
+            # Additionally, modify the element if it is (a) a geom and (b) has no name
+ if element.tag == "geom" and element.get("name") is None:
+ group = group_mapping[element.get("group")]
+ element.set("name", f"g{ctr_mapping[group]}_{group}")
+ ctr_mapping[group] += 1
+ # Return default filter key
+ return filter_key
+
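+        # e.g. (illustrative): with the filter above, the first unnamed visual geom (group "1")
+        # is auto-named "g0_vis" and the second "g1_vis"; unnamed collision geoms become
+        # "g0_col", "g1_col", and so on.
+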
+ # Parse element tree to get all relevant bodies, joints, actuators, and geom groups
+ self._elements = sort_elements(root=self.root, element_filter=_add_default_name_filter)
+ assert (
+ len(self._elements["root_body"]) == 1
+ ), "Invalid number of root bodies found for robot model. Expected 1," "got {}".format(
+ len(self._elements["root_body"])
+ )
+ self._elements["root_body"] = self._elements["root_body"][0]
+ self._elements["bodies"] = (
+ [self._elements["root_body"]] + self._elements["bodies"]
+ if "bodies" in self._elements
+ else [self._elements["root_body"]]
+ )
+ self._root_body = self._elements["root_body"].get("name")
+ self._bodies = [e.get("name") for e in self._elements.get("bodies", [])]
+ self._joints = [e.get("name") for e in self._elements.get("joints", [])]
+ self._actuators = [e.get("name") for e in self._elements.get("actuators", [])]
+ self._sites = [e.get("name") for e in self._elements.get("sites", [])]
+ self._sensors = [e.get("name") for e in self._elements.get("sensors", [])]
+ self._contact_geoms = [e.get("name") for e in self._elements.get("contact_geoms", [])]
+ self._visual_geoms = [e.get("name") for e in self._elements.get("visual_geoms", [])]
+ self._base_offset = string_to_array(self._elements["root_body"].get("pos", "0 0 0"))
+
+ # Update all xml element prefixes
+ add_prefix(root=self.root, prefix=self.naming_prefix, exclude=self.exclude_from_prefixing)
+
+ # Recolor all collision geoms appropriately
+ recolor_collision_geoms(root=self.worldbody, rgba=self.contact_geom_rgba)
+
+ # Add default materials
+ if macros.USING_INSTANCE_RANDOMIZATION:
+ tex_element, mat_element, _, used = add_material(root=self.worldbody, naming_prefix=self.naming_prefix)
+ # Only add if material / texture was actually used
+ if used:
+ self.asset.append(tex_element)
+ self.asset.append(mat_element)
+
+ def exclude_from_prefixing(self, inp):
+ """
+ By default, don't exclude any from being prefixed
+ """
+ return False
+
+ @property
+ def base_offset(self):
+ """
+ Provides position offset of root body.
+
+ Returns:
+ 3-array: (x,y,z) pos value of root_body body element. If no pos in element, returns all zeros.
+ """
+ return self._base_offset
+
+ @property
+ def name(self):
+ return "{}{}".format(type(self).__name__, self.idn)
+
+ @property
+ def naming_prefix(self):
+ return "{}_".format(self.idn)
+
+ @property
+ def root_body(self):
+ return self.correct_naming(self._root_body)
+
+ @property
+ def bodies(self):
+ return self.correct_naming(self._bodies)
+
+ @property
+ def joints(self):
+ return self.correct_naming(self._joints)
+
+ @property
+ def actuators(self):
+ return self.correct_naming(self._actuators)
+
+ @property
+ def sites(self):
+ return self.correct_naming(self._sites)
+
+ @property
+ def sensors(self):
+ return self.correct_naming(self._sensors)
+
+ @property
+ def contact_geoms(self):
+ return self.correct_naming(self._contact_geoms)
+
+ @property
+ def visual_geoms(self):
+ return self.correct_naming(self._visual_geoms)
+
+ @property
+ def important_sites(self):
+ return self.correct_naming(self._important_sites)
+
+ @property
+ def important_geoms(self):
+ return self.correct_naming(self._important_geoms)
+
+ @property
+ def important_sensors(self):
+ return self.correct_naming(self._important_sensors)
+
+ @property
+ def _important_sites(self):
+ """
+ Dict of sites corresponding to the important site geoms (e.g.: used to aid visualization during sim).
+
+ Returns:
+ dict: Important site geoms, where each specific geom name is mapped from keyword string entries
+ in the dict. Note that the mapped sites should be the RAW site names found directly in the XML file --
+ the naming prefix will be automatically added in the public method call
+ """
+ raise NotImplementedError
+
+ @property
+ def _important_geoms(self):
+ """
+ Geoms corresponding to important components of this model. String keywords should be mapped to lists of geoms.
+
+ Returns:
+ dict of list: Important set of geoms, where each set of geoms are grouped as a list and are
+ organized by keyword string entries into a dict. Note that the mapped geoms should be the RAW geom
+ names found directly in the XML file -- the naming prefix will be automatically added in the
+ public method call
+ """
+ raise NotImplementedError
+
+ @property
+ def _important_sensors(self):
+ """
+ Dict of important sensors enabled for this model.
+
+ Returns:
+ dict: Important sensors for this model, where each specific sensor name is mapped from keyword string
+ entries in the dict. Note that the mapped geoms should be the RAW sensor names found directly in the
+ XML file -- the naming prefix will be automatically added in the public method call
+ """
+ raise NotImplementedError
+
+ @property
+ def contact_geom_rgba(self):
+ """
+ RGBA color to assign to all contact geoms for this model
+
+ Returns:
+ 4-array: (r,g,b,a) values from 0 to 1 for this model's set of contact geoms
+ """
+ raise NotImplementedError
+
+ @property
+ def bottom_offset(self):
+ """
+ Returns vector from model root body to model bottom.
+ Useful for, e.g. placing models on a surface.
+ By default, this corresponds to the root_body's base offset.
+
+ Returns:
+ np.array: (dx, dy, dz) offset vector
+ """
+ return self.base_offset
+
+ @property
+ def top_offset(self):
+ raise NotImplementedError
+
+ @property
+ def horizontal_radius(self):
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2814e50ff77bc148113bd9e3824e8f63b56bfd1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/__init__.py
@@ -0,0 +1,31 @@
+from .gripper_model import GripperModel
+from .gripper_factory import gripper_factory
+from .gripper_tester import GripperTester
+
+from .panda_gripper import PandaGripper
+from .rethink_gripper import RethinkGripper
+from .robotiq_85_gripper import Robotiq85Gripper
+from .robotiq_gripper_85_real_kinova import Robotiq85GripperRealKinova
+from .robotiq_three_finger_gripper import RobotiqThreeFingerGripper, RobotiqThreeFingerDexterousGripper
+from .panda_gripper import PandaGripper
+from .jaco_three_finger_gripper import JacoThreeFingerGripper, JacoThreeFingerDexterousGripper
+from .robotiq_140_gripper import Robotiq140Gripper
+from .wiping_gripper import WipingGripper
+from .null_gripper import NullGripper
+
+
+GRIPPER_MAPPING = {
+ "RethinkGripper": RethinkGripper,
+ "PandaGripper": PandaGripper,
+ "JacoThreeFingerGripper": JacoThreeFingerGripper,
+ "JacoThreeFingerDexterousGripper": JacoThreeFingerDexterousGripper,
+ "WipingGripper": WipingGripper,
+ "Robotiq85Gripper": Robotiq85Gripper,
+ "Robotiq140Gripper": Robotiq140Gripper,
+ "RobotiqThreeFingerGripper": RobotiqThreeFingerGripper,
+ "RobotiqThreeFingerDexterousGripper": RobotiqThreeFingerDexterousGripper,
+ "Robotiq85GripperRealKinova": Robotiq85GripperRealKinova,
+ None: NullGripper,
+}
+
+ALL_GRIPPERS = GRIPPER_MAPPING.keys()
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_factory.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..133dce39b24ec446089e79267fe9fd8247259e22
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_factory.py
@@ -0,0 +1,29 @@
+"""
+Defines a string-based method of initializing grippers
+"""
+
+
+def gripper_factory(name, idn=0):
+ """
+ Generator for grippers
+
+ Creates a GripperModel instance with the provided name.
+
+ Args:
+ name (None or str): the name of the gripper class
+ idn (int or str): Number or some other unique identification string for this gripper instance
+
+ Returns:
+ GripperModel: requested gripper instance
+
+ Raises:
+ XMLError: [invalid XML]
+ """
+ # Import GRIPPER_MAPPING at runtime so we avoid circular imports
+ from robosuite.models.grippers import ALL_GRIPPERS, GRIPPER_MAPPING
+
+ # Make sure gripper is valid
+ assert name in GRIPPER_MAPPING, f"Unknown gripper name: {name}. Valid options are: {ALL_GRIPPERS}"
+
+ # Generate gripper
+ return GRIPPER_MAPPING[name](idn=idn)
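+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the original robosuite file): build a gripper
+    # by name via the factory. Assumes a working robosuite install; "PandaGripper"
+    # is one of the keys registered in GRIPPER_MAPPING (robosuite.models.grippers).
+    gripper = gripper_factory("PandaGripper", idn=0)
+    print(type(gripper).__name__, "dof =", gripper.dof)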
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_model.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ebe18b3ae96aa2a0d04134071388dde1c75de10
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_model.py
@@ -0,0 +1,161 @@
+"""
+Defines the base class of all grippers
+"""
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.base import MujocoXMLModel
+from robosuite.utils.mjcf_utils import GRIPPER_COLLISION_COLOR, find_elements, string_to_array
+
+
+class GripperModel(MujocoXMLModel):
+ """
+ Base class for grippers
+
+ Args:
+ fname (str): Path to relevant xml file to create this gripper instance
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, fname, idn=0):
+ super().__init__(fname, idn=idn)
+
+ # Set variable to hold current action being outputted
+ self.current_action = np.zeros(self.dof)
+
+ # Grab gripper offset (string -> np.array -> elements [1, 2, 3, 0] (x, y, z, w))
+ # This is the compounded rotation of the base body and the eef body
+ base_quat = np.fromstring(self.worldbody[0].attrib.get("quat", "1 0 0 0"), dtype=np.float64, sep=" ")[
+ [1, 2, 3, 0]
+ ]
+ eef_element = find_elements(
+ root=self.root, tags="body", attribs={"name": self.correct_naming("eef")}, return_first=True
+ )
+ eef_relative_quat = string_to_array(eef_element.get("quat", "1 0 0 0"))[[1, 2, 3, 0]]
+ self.rotation_offset = T.quat_multiply(eef_relative_quat, base_quat)
+
+ def format_action(self, action):
+ """
+ Given (-1,1) abstract control as np-array
+ returns the (-1,1) control signals
+ for underlying actuators as 1-d np array
+ """
+ raise NotImplementedError
+
+ # -------------------------------------------------------------------------------------- #
+ # Properties: In general, these are the name-adjusted versions from the private #
+ # subclass implementations pulled from their respective raw xml files #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def naming_prefix(self):
+ return "gripper{}_".format(self.idn)
+
+ @property
+ def speed(self):
+ """
+ How quickly the gripper opens / closes
+
+ Returns:
+ float: Speed of the gripper
+ """
+ return 0.0
+
+ @property
+ def dof(self):
+ """
+ Defines the number of DOF of the gripper
+
+ Returns:
+ int: gripper DOF
+ """
+ return len(self._actuators)
+
+ @property
+ def bottom_offset(self):
+ return np.zeros(3)
+
+ @property
+ def top_offset(self):
+ return np.zeros(3)
+
+ @property
+ def horizontal_radius(self):
+ return 0
+
+ @property
+ def contact_geom_rgba(self):
+ return GRIPPER_COLLISION_COLOR
+
+ # -------------------------------------------------------------------------------------- #
+ # All subclasses must implement the following properties #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def init_qpos(self):
+ """
+ Defines the default rest (open) qpos of the gripper
+
+ Returns:
+ np.array: Default init qpos of this gripper
+ """
+ raise NotImplementedError
+
+ @property
+ def _important_sites(self):
+ """
+ Sites used to aid visualization by humans (usually "grip_site" and "grip_cylinder"),
+ and should be hidden from robots
+
+ Returns:
+ dict:
+
+ :`'grip_site'`: Name of grip actuation intersection location site
+ :`'grip_cylinder'`: Name of grip actuation z-axis location site
+ :`'ee'`: Name of end effector site
+ :`'ee_x'`: Name of end effector site (x-axis)
+ :`'ee_y'`: Name of end effector site (y-axis)
+ :`'ee_z'`: Name of end effector site (z-axis)
+ """
+ return {
+ "grip_site": "grip_site",
+ "grip_cylinder": "grip_site_cylinder",
+ "ee": "ee",
+ "ee_x": "ee_x",
+ "ee_y": "ee_y",
+ "ee_z": "ee_z",
+ }
+
+ @property
+ def _important_geoms(self):
+ """
+ Geoms corresponding to important components of the gripper (by default, left_finger, right_finger,
+ left_fingerpad, right_fingerpad).
+ Note that these are the raw string names directly pulled from a gripper's corresponding XML file,
+ NOT the adjusted name with an auto-generated naming prefix
+
+ Note that this should be a dict of lists.
+
+ Returns:
+ dict of list: Raw XML important geoms, where each set of geoms are grouped as a list and are
+ organized by keyword string entries into a dict
+ """
+ return {
+ "left_finger": [],
+ "right_finger": [],
+ "left_fingerpad": [],
+ "right_fingerpad": [],
+ }
+
+ @property
+ def _important_sensors(self):
+ """
+ Sensor names for each gripper (usually "force_ee" and "torque_ee")
+
+ Returns:
+ dict:
+
+ :`'force_ee'`: Name of force eef sensor for this gripper
+ :`'torque_ee'`: Name of torque eef sensor for this gripper
+ """
+ return {sensor: sensor for sensor in ["force_ee", "torque_ee"]}
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_tester.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_tester.py
new file mode 100644
index 0000000000000000000000000000000000000000..e297f4f25c42322530857fb0f59c3a32b28406ce
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/gripper_tester.py
@@ -0,0 +1,235 @@
+"""
+Defines GripperTester that is used to test the physical properties of various grippers
+"""
+import xml.etree.ElementTree as ET
+
+import numpy as np
+
+import robosuite.macros as macros
+from robosuite.models.arenas.table_arena import TableArena
+from robosuite.models.objects import BoxObject
+from robosuite.models.world import MujocoWorldBase
+from robosuite.utils import OpenCVRenderer
+from robosuite.utils.binding_utils import MjRenderContextOffscreen, MjSim
+from robosuite.utils.mjcf_utils import array_to_string, new_actuator, new_joint
+
+
+class GripperTester:
+ """
+ A class that is used to test grippers
+
+ Args:
+ gripper (GripperModel): A gripper instance to be tested
+ pos (str): (x y z) position to place the gripper in string form, e.g. '0 0 0.3'
+ quat (str): rotation to apply to gripper in string form, e.g. '0 0 1 0' to flip z axis
+ gripper_low_pos (float): controls the gripper's low z position, larger -> higher
+ gripper_high_pos (float): controls the gripper's high z position, larger -> higher,
+ must be larger than gripper_low_pos
+ box_size (None or 3-tuple of int): the size of the box to grasp, None defaults to [0.02, 0.02, 0.02]
+ box_density (int): the density of the box to grasp
+ step_time (int): the interval between two gripper actions
+ render (bool): if True, show rendering
+ """
+
+ def __init__(
+ self,
+ gripper,
+ pos,
+ quat,
+ gripper_low_pos,
+ gripper_high_pos,
+ box_size=None,
+ box_density=10000,
+ step_time=400,
+ render=True,
+ ):
+ # define viewer
+ self.viewer = None
+
+ world = MujocoWorldBase()
+ # Add a table
+ arena = TableArena(table_full_size=(0.4, 0.4, 0.1), table_offset=(0, 0, 0.1), has_legs=False)
+ world.merge(arena)
+
+ # Add a gripper
+ self.gripper = gripper
+ # Create another body with a slider joint to which we'll add this gripper
+ gripper_body = ET.Element("body")
+ gripper_body.set("pos", pos)
+ gripper_body.set("quat", quat) # flip z
+ gripper_body.append(new_joint(name="gripper_z_joint", type="slide", axis="0 0 -1", damping="50"))
+ # Add all gripper bodies to this higher level body
+ for body in gripper.worldbody:
+ gripper_body.append(body)
+ # Merge all of the gripper tags except its bodies
+ world.merge(gripper, merge_body=None)
+ # Manually add the higher level body we created
+ world.worldbody.append(gripper_body)
+ # Create a new actuator to control our slider joint
+ world.actuator.append(new_actuator(joint="gripper_z_joint", act_type="position", name="gripper_z", kp="500"))
+
+ # Add an object for grasping
+ # density is in units kg / m3
+ TABLE_TOP = [0, 0, 0.09]
+ if box_size is None:
+ box_size = [0.02, 0.02, 0.02]
+ box_size = np.array(box_size)
+ self.cube = BoxObject(
+ name="object", size=box_size, rgba=[1, 0, 0, 1], friction=[1, 0.005, 0.0001], density=box_density
+ )
+ object_pos = np.array(TABLE_TOP + box_size * [0, 0, 1])
+ mujoco_object = self.cube.get_obj()
+ # Set the position of this object
+ mujoco_object.set("pos", array_to_string(object_pos))
+ # Add our object to the world body
+ world.worldbody.append(mujoco_object)
+
+ # add reference objects for x and y axes
+ x_ref = BoxObject(
+ name="x_ref", size=[0.01, 0.01, 0.01], rgba=[0, 1, 0, 1], obj_type="visual", joints=None
+ ).get_obj()
+ x_ref.set("pos", "0.2 0 0.105")
+ world.worldbody.append(x_ref)
+ y_ref = BoxObject(
+ name="y_ref", size=[0.01, 0.01, 0.01], rgba=[0, 0, 1, 1], obj_type="visual", joints=None
+ ).get_obj()
+ y_ref.set("pos", "0 0.2 0.105")
+ world.worldbody.append(y_ref)
+
+ self.world = world
+ self.render = render
+ self.simulation_ready = False
+ self.step_time = step_time
+ self.cur_step = 0
+ if gripper_low_pos > gripper_high_pos:
+ raise ValueError(
+ "gripper_low_pos {} is larger " "than gripper_high_pos {}".format(gripper_low_pos, gripper_high_pos)
+ )
+ self.gripper_low_pos = gripper_low_pos
+ self.gripper_high_pos = gripper_high_pos
+
+ def start_simulation(self):
+ """
+ Starts simulation of the test world
+ """
+ model_xml = self.world.get_xml()
+ self.sim = MjSim.from_xml_string(model_xml)
+
+ if self.render:
+ self.viewer = OpenCVRenderer(self.sim)
+ # We also need to add the offscreen context
+ if self.sim._render_context_offscreen is None:
+ render_context = MjRenderContextOffscreen(self.sim, device_id=-1)
+ self.sim.add_render_context(render_context)
+ self.sim_state = self.sim.get_state()
+
+ # For gravity correction
+ gravity_corrected = ["gripper_z_joint"]
+ self._gravity_corrected_qvels = [self.sim.model.get_joint_qvel_addr(x) for x in gravity_corrected]
+
+ self.gripper_z_id = self.sim.model.actuator_name2id("gripper_z")
+ self.gripper_z_is_low = False
+
+ self.gripper_actuator_ids = [self.sim.model.actuator_name2id(x) for x in self.gripper.actuators]
+
+ self.gripper_is_closed = True
+
+ self.object_id = self.sim.model.body_name2id(self.cube.root_body)
+ object_default_pos = self.sim.data.body_xpos[self.object_id]
+ self.object_default_pos = np.array(object_default_pos, copy=True)
+
+ self.reset()
+ self.simulation_ready = True
+
+ def reset(self):
+ """
+ Resets the simulation to the initial state
+ """
+ self.sim.set_state(self.sim_state)
+ self.cur_step = 0
+
+ def close(self):
+ """
+ Close the viewer if it exists
+ """
+ if self.viewer is not None:
+ self.viewer.close()
+
+ def step(self):
+ """
+ Forward the simulation by one timestep
+
+ Raises:
+ RuntimeError: if start_simulation is not yet called.
+ """
+ if not self.simulation_ready:
+ raise RuntimeError("Call start_simulation before calling step")
+ if self.gripper_z_is_low:
+ self.sim.data.ctrl[self.gripper_z_id] = self.gripper_low_pos
+ else:
+ self.sim.data.ctrl[self.gripper_z_id] = self.gripper_high_pos
+ if self.gripper_is_closed:
+ self._apply_gripper_action(1)
+ else:
+ self._apply_gripper_action(-1)
+ self._apply_gravity_compensation()
+ self.sim.step()
+ if self.render:
+ self.viewer.render()
+ self.cur_step += 1
+
+ def _apply_gripper_action(self, action):
+ """
+ Applies binary gripper action
+
+ Args:
+ action (int): Action to apply. Should be -1 (open) or 1 (closed)
+ """
+ gripper_action_actual = self.gripper.format_action(np.array([action]))
+ # rescale normalized gripper action to control ranges
+ ctrl_range = self.sim.model.actuator_ctrlrange[self.gripper_actuator_ids]
+ bias = 0.5 * (ctrl_range[:, 1] + ctrl_range[:, 0])
+ weight = 0.5 * (ctrl_range[:, 1] - ctrl_range[:, 0])
+ applied_gripper_action = bias + weight * gripper_action_actual
+ self.sim.data.ctrl[self.gripper_actuator_ids] = applied_gripper_action
+
+ def _apply_gravity_compensation(self):
+ """
+ Applies gravity compensation to the simulation
+ """
+ self.sim.data.qfrc_applied[self._gravity_corrected_qvels] = self.sim.data.qfrc_bias[
+ self._gravity_corrected_qvels
+ ]
+
+ def loop(self, total_iters=1, test_y=False, y_baseline=0.01):
+ """
+ Performs lower, grip, raise and release actions of a gripper,
+ each phase running for step_time timesteps
+
+ Args:
+ total_iters (int): Iterations to perform before exiting
+ test_y (bool): test if object is lifted
+ y_baseline (float): threshold for determining that object is lifted
+ """
+ seq = [(False, False), (True, False), (True, True), (False, True)]
+ for cur_iter in range(total_iters):
+ for cur_plan in seq:
+ self.gripper_z_is_low, self.gripper_is_closed = cur_plan
+ for step in range(self.step_time):
+ self.step()
+ if test_y:
+ if not self.object_height > y_baseline:
+ raise ValueError(
+ "object is lifed by {}, ".format(self.object_height)
+ + "not reaching the requirement {}".format(y_baseline)
+ )
+
+ @property
+ def object_height(self):
+ """
+ Queries the height (z) of the object relative to the ground
+
+ Returns:
+ float: Object height relative to default (ground) object position
+ """
+ return self.sim.data.body_xpos[self.object_id][2] - self.object_default_pos[2]
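+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the original robosuite file): exercise a
+    # gripper with the tester. The pose strings and z limits below are assumed
+    # values for a small tabletop scene, not canonical robosuite settings.
+    from robosuite.models.grippers import PandaGripper
+
+    tester = GripperTester(
+        gripper=PandaGripper(),
+        pos="0 0 0.3",
+        quat="0 0 1 0",
+        gripper_low_pos=-0.07,
+        gripper_high_pos=0.1,
+        render=False,
+    )
+    tester.start_simulation()
+    tester.loop(total_iters=1, test_y=True)
+    print("final object height:", tester.object_height)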
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/jaco_three_finger_gripper.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/jaco_three_finger_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..188412d71880d3cec60606fdedc3dd30707c9162
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/jaco_three_finger_gripper.py
@@ -0,0 +1,107 @@
+"""
+Gripper for Kinova's Jaco robot arm (has three fingers).
+"""
+import numpy as np
+
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class JacoThreeFingerGripperBase(GripperModel):
+ """
+ Gripper for Kinova's Jaco robot arm (has three fingers).
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/jaco_three_finger_gripper.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return np.array([0.5, 0, 0.5, 0, 0.5, 0])
+
+ @property
+ def _important_geoms(self):
+ return {
+ "left_finger": [
+ "index_proximal_collision",
+ "index_distal_collision",
+ "index_tip_collision",
+ "pinky_proximal_collision",
+ "pinky_distal_collision",
+ "pinky_tip_collision",
+ "index_tip_collision",
+ "pinky_pad_collision",
+ ],
+ "right_finger": [
+ "thumb_proximal_collision",
+ "thumb_distal_collision",
+ "thumb_tip_collision",
+ "thumb_pad_collision",
+ ],
+ "left_fingerpad": ["index_pad_collision", "pinky_pad_collision"],
+ "right_fingerpad": ["thumb_pad_collision"],
+ }
+
+
+class JacoThreeFingerGripper(JacoThreeFingerGripperBase):
+ """
+ Modifies JacoThreeFingerGripperBase to only take one action.
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ -1 => open, 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == self.dof
+ self.current_action = np.clip(self.current_action - self.speed * np.sign(action), -1.0, 1.0)
+ return self.current_action
+
+ @property
+ def speed(self):
+ return 0.005
+
+ @property
+ def dof(self):
+ return 1
+
+
+class JacoThreeFingerDexterousGripper(JacoThreeFingerGripperBase):
+ """
+ Dexterous variation of the Jaco gripper in which all fingers are actuated independently
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ all -1 => open, all 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == self.dof
+ self.current_action = np.clip(self.current_action - self.speed * np.sign(action), -1.0, 1.0)
+ return self.current_action
+
+ @property
+ def speed(self):
+ return 0.005
+
+ @property
+ def dof(self):
+ return 3
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/null_gripper.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/null_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..48f6a804744099f5cfca1dd23397d2bd7f5cd5ca
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/null_gripper.py
@@ -0,0 +1,24 @@
+"""
+Null Gripper (if we don't want to attach gripper to robot eef).
+"""
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class NullGripper(GripperModel):
+ """
+ Dummy Gripper class to represent no gripper
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/null_gripper.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return None
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/panda_gripper.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/panda_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..64650e33d40f6df02d48032c91df3c71213d1751
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/panda_gripper.py
@@ -0,0 +1,66 @@
+"""
+Gripper for Franka's Panda (has two fingers).
+"""
+import numpy as np
+
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class PandaGripperBase(GripperModel):
+ """
+ Gripper for Franka's Panda (has two fingers).
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/panda_gripper.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return np.array([0.020833, -0.020833])
+
+ @property
+ def _important_geoms(self):
+ return {
+ "left_finger": ["finger1_collision", "finger1_pad_collision"],
+ "right_finger": ["finger2_collision", "finger2_pad_collision"],
+ "left_fingerpad": ["finger1_pad_collision"],
+ "right_fingerpad": ["finger2_pad_collision"],
+ }
+
+
+class PandaGripper(PandaGripperBase):
+ """
+ Modifies PandaGripperBase to only take one action.
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ -1 => open, 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == self.dof
+ self.current_action = np.clip(
+ self.current_action + np.array([-1.0, 1.0]) * self.speed * np.sign(action), -1.0, 1.0
+ )
+ return self.current_action
+
+ @property
+ def speed(self):
+ return 0.01
+
+ @property
+ def dof(self):
+ return 1
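+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the original robosuite file): repeated
+    # "close" commands (+1) move current_action towards the closed limit by
+    # `speed` per call, so the fingers close gradually rather than instantly.
+    gripper = PandaGripper()
+    for _ in range(3):
+        ctrl = gripper.format_action(np.array([1]))
+    print(ctrl)  # roughly [-0.03, 0.03] after three close steps from the zero state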
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/rethink_gripper.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/rethink_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa3852fcf1122e29682587b25643f6fdbc22cbf1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/rethink_gripper.py
@@ -0,0 +1,66 @@
+"""
+Gripper with two fingers for Rethink Robots.
+"""
+import numpy as np
+
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class RethinkGripperBase(GripperModel):
+ """
+ Gripper with long two-fingered parallel jaw.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/rethink_gripper.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return np.array([0.020833, -0.020833])
+
+ @property
+ def _important_geoms(self):
+ return {
+ "left_finger": ["l_finger_g0", "l_finger_g1", "l_fingertip_g0", "l_fingerpad_g0"],
+ "right_finger": ["r_finger_g0", "r_finger_g1", "r_fingertip_g0", "r_fingerpad_g0"],
+ "left_fingerpad": ["l_fingerpad_g0"],
+ "right_fingerpad": ["r_fingerpad_g0"],
+ }
+
+
+class RethinkGripper(RethinkGripperBase):
+ """
+ Modifies two finger base to only take one action.
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ -1 => open, 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == 1
+ self.current_action = np.clip(
+ self.current_action + np.array([1.0, -1.0]) * self.speed * np.sign(action), -1.0, 1.0
+ )
+ return self.current_action
+
+ @property
+ def speed(self):
+ return 0.01
+
+ @property
+ def dof(self):
+ return 1
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_140_gripper.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_140_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..38a2877e1f6f6366b3ae42f66a4c083dedb4ad09
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_140_gripper.py
@@ -0,0 +1,77 @@
+"""
+Gripper with 140mm Jaw width from Robotiq (has two fingers).
+"""
+import numpy as np
+
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class Robotiq140GripperBase(GripperModel):
+ """
+ Gripper with 140mm Jaw width from Robotiq (has two fingers).
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/robotiq_gripper_140.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return np.array([0.012, 0.065, 0.065, -0.012, 0.065, 0.065])
+
+ @property
+ def _important_geoms(self):
+ return {
+ "left_finger": [
+ "left_outer_finger_collision",
+ "left_inner_finger_collision",
+ "left_fingertip_collision",
+ "left_fingerpad_collision",
+ ],
+ "right_finger": [
+ "right_outer_finger_collision",
+ "right_inner_finger_collision",
+ "right_fingertip_collision",
+ "right_fingerpad_collision",
+ ],
+ "left_fingerpad": ["left_fingerpad_collision"],
+ "right_fingerpad": ["right_fingerpad_collision"],
+ }
+
+
+class Robotiq140Gripper(Robotiq140GripperBase):
+ """
+ Modifies Robotiq140GripperBase to only take one action.
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ -1 => open, 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == 1
+ self.current_action = np.clip(
+ self.current_action + np.array([1.0, -1.0]) * self.speed * np.sign(action), -1.0, 1.0
+ )
+ return self.current_action
+
+ @property
+ def speed(self):
+ return 0.01
+
+ @property
+ def dof(self):
+ return 1
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_85_gripper.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_85_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..690017c2c3247cf987277583fa1e7d5dd63904ee
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_85_gripper.py
@@ -0,0 +1,74 @@
+"""
+6-DoF gripper with its open/close variant
+"""
+import numpy as np
+
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class Robotiq85GripperBase(GripperModel):
+ """
+ 6-DoF Robotiq gripper.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/robotiq_gripper_85_v4.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return np.array([-0.026, -0.267, -0.200, -0.026, -0.267, -0.200])
+
+ @property
+ def _important_geoms(self):
+ return {
+ "left_finger": [
+ "left_outer_finger_collision",
+ "left_inner_finger_collision",
+ "left_fingertip_collision",
+ "left_fingerpad_collision",
+ ],
+ "right_finger": [
+ "right_outer_finger_collision",
+ "right_inner_finger_collision",
+ "right_fingertip_collision",
+ "right_fingerpad_collision",
+ ],
+ "left_fingerpad": ["left_fingerpad_collision"],
+ "right_fingerpad": ["right_fingerpad_collision"],
+ }
+
+
+class Robotiq85Gripper(Robotiq85GripperBase):
+ """
+ 1-DoF variant of RobotiqGripperBase.
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ -1 => open, 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == 1
+ self.current_action = np.clip(self.current_action + self.speed * np.sign(action), -1.0, 1.0)
+ return self.current_action
+
+ @property
+ def speed(self):
+ return 0.01
+
+ @property
+ def dof(self):
+ return 1
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_gripper_85_real_kinova.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_gripper_85_real_kinova.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ddfb4e988b60b177ebd9223b176d97bc1ecac17
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_gripper_85_real_kinova.py
@@ -0,0 +1,78 @@
+"""
+6-DoF gripper with its open/close variant
+"""
+import numpy as np
+
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class Robotiq85GripperRealKinovaBase(GripperModel):
+ """
+ 6-DoF Robotiq gripper.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/robotiq_gripper_85_real_kinova.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return np.array([-0.026, -0.267, -0.200, -0.026, -0.267, -0.200])
+ # return np.array([0.00227, 0.000136, 0.00247, -0.00267, 0.00227, 0.000136, 0.00247, -0.00267])
+ # return np.array([0.00258958, 0.00264364, 0.0027039, 0.00258958, 0.00264361, 0.00270381])
+
+
+ @property
+ def _important_geoms(self):
+ return {
+ "left_finger": [
+ "left_outer_finger_collision",
+ "left_inner_finger_collision",
+ "left_fingertip_collision",
+ "left_fingerpad_collision",
+ ],
+ "right_finger": [
+ "right_outer_finger_collision",
+ "right_inner_finger_collision",
+ "right_fingertip_collision",
+ "right_fingerpad_collision",
+ ],
+ "left_fingerpad": ["left_fingerpad_collision"],
+ "right_fingerpad": ["right_fingerpad_collision"],
+ }
+
+
+class Robotiq85GripperRealKinova(Robotiq85GripperRealKinovaBase):
+ """
+ 1-DoF variant of Robotiq85GripperRealKinovaBase.
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ -1 => open, 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == 1
+ self.current_action = np.clip(self.current_action + self.speed * np.sign(action), -1.0, 1.0)
+ print("Modified gripper action: ", self.current_action)
+ return self.current_action
+
+ @property
+ def speed(self):
+ return 0.01
+
+ @property
+ def dof(self):
+ return 1
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_three_finger_gripper.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_three_finger_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d36d56e84dbe7636b5f4e9899dec3123615c2a2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/robotiq_three_finger_gripper.py
@@ -0,0 +1,115 @@
+"""
+Gripper with 11-DoF controlling three fingers and its open/close variant.
+"""
+import numpy as np
+
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class RobotiqThreeFingerGripperBase(GripperModel):
+ """
+ Gripper with 11 dof controlling three fingers.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/robotiq_gripper_s.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return np.zeros(11)
+
+ @property
+ def _important_geoms(self):
+ return {
+ "left_finger": [
+ "f1_l0",
+ "f1_l1",
+ "f1_l2",
+ "f1_l3",
+ "f2_l0",
+ "f2_l1",
+ "f2_l2",
+ "f2_l3",
+ "f1_tip_collision",
+ "f2_tip_collision",
+ "f1_pad_collision",
+ "f2_pad_collision",
+ ],
+ "right_finger": [
+ "f3_l0",
+ "f3_l1",
+ "f3_l2",
+ "f3_l3",
+ "finger_middle_tip_collision",
+ "finger_middle_pad_collision",
+ ],
+ "left_fingerpad": ["f1_pad_collision", "f2_pad_collision"],
+ "right_fingerpad": ["finger_middle_pad_collision"],
+ }
+
+
+class RobotiqThreeFingerGripper(RobotiqThreeFingerGripperBase):
+ """
+ 1-DoF variant of RobotiqThreeFingerGripperBase.
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ -1 => open, 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == self.dof
+ self.current_action = np.clip(self.current_action + self.speed * np.array(action), -1.0, 1.0)
+ # Automatically set the scissor joint to "closed" position by default
+ return np.concatenate([self.current_action * np.ones(3), [-1]])
+
+ @property
+ def speed(self):
+ return 0.01
+
+ @property
+ def dof(self):
+ return 1
+
+
+class RobotiqThreeFingerDexterousGripper(RobotiqThreeFingerGripperBase):
+ """
+ Dexterous variation of the 3-finger Robotiq gripper in which all fingers are actuated independently, as well
+ as the scissor joint between fingers 1 and 2
+ """
+
+ def format_action(self, action):
+ """
+ Maps continuous action into binary output
+ all -1 => open, all 1 => closed
+
+ Args:
+ action (np.array): gripper-specific action
+
+ Raises:
+ AssertionError: [Invalid action dimension size]
+ """
+ assert len(action) == self.dof
+ self.current_action = np.clip(self.current_action + self.speed * np.sign(action), -1.0, 1.0)
+ return self.current_action
+
+ @property
+ def speed(self):
+ return 0.01
+
+ @property
+ def dof(self):
+ return 4
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/grippers/wiping_gripper.py b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/wiping_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..692475efaba30837edd8ef987801124f6dfb672b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/grippers/wiping_gripper.py
@@ -0,0 +1,34 @@
+"""
+Gripper without fingers to wipe a surface
+"""
+from robosuite.models.grippers.gripper_model import GripperModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class WipingGripper(GripperModel):
+ """
+ A Wiping Gripper with no actuation and enabled with sensors to detect contact forces
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("grippers/wiping_gripper.xml"), idn=idn)
+
+ def format_action(self, action):
+ return action
+
+ @property
+ def init_qpos(self):
+ return None
+
+ @property
+ def _important_geoms(self):
+ return {
+ "left_finger": [],
+ "right_finger": [],
+ "left_fingerpad": [],
+ "right_fingerpad": [],
+ "corners": ["wiping_corner1", "wiping_corner2", "wiping_corner3", "wiping_corner4"],
+ }
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/mounts/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f92d43a51d49865a410a7899b9c41aa99085bd09
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/__init__.py
@@ -0,0 +1,15 @@
+from .mount_model import MountModel
+from .mount_factory import mount_factory
+
+from .rethink_mount import RethinkMount
+from .phantom_mount import PhantomMount
+from .null_mount import NullMount
+
+
+MOUNT_MAPPING = {
+ "RethinkMount": RethinkMount,
+ "PhantomMount": PhantomMount,
+ None: NullMount,
+}
+
+ALL_MOUNTS = MOUNT_MAPPING.keys()
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/mounts/mount_factory.py b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/mount_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..09b30335494171d2916068fa03551be36582caec
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/mount_factory.py
@@ -0,0 +1,25 @@
+"""
+Defines a string-based method of initializing mounts
+"""
+
+
+def mount_factory(name, idn=0):
+ """
+ Generator for mounts
+
+ Creates a MountModel instance with the provided name.
+
+ Args:
+ name (None or str): the name of the mount class
+ idn (int or str): Number or some other unique identification string for this mount instance
+
+ Returns:
+ MountModel: requested mount instance
+
+ Raises:
+ XMLError: [invalid XML]
+ """
+ # Import MOUNT_MAPPING at runtime so we avoid circular imports
+ from robosuite.models.mounts import MOUNT_MAPPING
+
+ # Make sure the requested mount is valid before instantiating it
+ assert name in MOUNT_MAPPING, "Unknown mount name: {}. Valid options are: {}".format(name, list(MOUNT_MAPPING))
+
+ return MOUNT_MAPPING[name](idn=idn)
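+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the original robosuite file): build a mount
+    # by name; passing None as the name returns the NullMount placeholder.
+    mount = mount_factory("RethinkMount", idn=0)
+    print(type(mount).__name__, "top_offset =", mount.top_offset)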
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/mounts/mount_model.py b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/mount_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeb9be9a35122079e4d836f3cb875dc959790e4b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/mount_model.py
@@ -0,0 +1,92 @@
+"""
+Defines the base class of all mounts
+"""
+import numpy as np
+
+from robosuite.models.base import MujocoXMLModel
+from robosuite.utils.mjcf_utils import MOUNT_COLLISION_COLOR
+
+
+class MountModel(MujocoXMLModel):
+ """
+ Base class for mounts that will be attached to robots. Note that this model's root body will be directly
+ appended to the robot's root body, so all offsets should be taken relative to that.
+
+ Args:
+ fname (str): Path to relevant xml file to create this mount instance
+ idn (int or str): Number or some other unique identification string for this gripper instance
+ """
+
+ def __init__(self, fname, idn=0):
+ super().__init__(fname, idn=idn)
+
+ # Grab mount offset (string -> np.array -> elements [1, 2, 3, 0] (x, y, z, w))
+ self.rotation_offset = np.fromstring(
+ self.worldbody[0].attrib.get("quat", "1 0 0 0"), dtype=np.float64, sep=" "
+ )[[1, 2, 3, 0]]
+
+ # -------------------------------------------------------------------------------------- #
+ # Properties: In general, these are the name-adjusted versions from the private #
+ # subclass implementations pulled from their respective raw xml files #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def naming_prefix(self):
+ return "mount{}_".format(self.idn)
+
+ @property
+ def _important_sites(self):
+ """
+ Returns:
+ dict: (Default is no important sites; i.e.: empty dict)
+ """
+ return {}
+
+ @property
+ def _important_geoms(self):
+ """
+ Returns:
+ dict: (Default is no important geoms; i.e.: empty dict)
+ """
+ return {}
+
+ @property
+ def _important_sensors(self):
+ """
+ Returns:
+ dict: (Default is no sensors; i.e.: empty dict)
+ """
+ return {}
+
+ @property
+ def contact_geom_rgba(self):
+ return MOUNT_COLLISION_COLOR
+
+ # -------------------------------------------------------------------------------------- #
+ # All subclasses must implement the following properties #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def top_offset(self):
+ """
+ Returns vector from model root body to model top.
+ This should correspond to the distance from the root body to the actual mounting surface
+ location of this mount.
+
+ Returns:
+ np.array: (dx, dy, dz) offset vector
+ """
+ raise NotImplementedError
+
+ @property
+ def horizontal_radius(self):
+ """
+ Returns maximum distance from model root body to any radial point of the model.
+
+ Helps us put models programmatically without them flying away due to a huge initial contact force.
+ Must be defined by subclass.
+
+ Returns:
+ float: radius
+ """
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/mounts/null_mount.py b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/null_mount.py
new file mode 100644
index 0000000000000000000000000000000000000000..3848e4ca2d37377780220253117ea74bea1ca769
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/null_mount.py
@@ -0,0 +1,27 @@
+"""
+Null mount (used when no mount should be attached to the robot).
+"""
+import numpy as np
+
+from robosuite.models.mounts.mount_model import MountModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class NullMount(MountModel):
+ """
+ Dummy Mount to signify no mount.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this mount instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("mounts/null_mount.xml"), idn=idn)
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, 0))
+
+ @property
+ def horizontal_radius(self):
+ return 0
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/mounts/phantom_mount.py b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/phantom_mount.py
new file mode 100644
index 0000000000000000000000000000000000000000..0be462ea19b7e8b6d644628775c651d4d03b38cb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/phantom_mount.py
@@ -0,0 +1,28 @@
+"""
+Phantom mount.
+"""
+import numpy as np
+
+from robosuite.models.mounts.mount_model import MountModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class PhantomMount(MountModel):
+ """
+ Mount used for the Phantom robot setup. Includes only a wheeled pedestal.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this mount instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("mounts/phantom_mount.xml"), idn=idn)
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, -0.062))
+
+ @property
+ def horizontal_radius(self):
+ # TODO: This may be inaccurate; just a placeholder for now
+ return 0.25
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/mounts/rethink_mount.py b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/rethink_mount.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed2903c91521a86268b6f8f109b45794a4dea30b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/mounts/rethink_mount.py
@@ -0,0 +1,28 @@
+"""
+Rethink's Generic Mount (Officially used on Sawyer).
+"""
+import numpy as np
+
+from robosuite.models.mounts.mount_model import MountModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class RethinkMount(MountModel):
+ """
+ Mount officially used for Rethink's Sawyer Robot. Includes a controller box and wheeled pedestal.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this mount instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("mounts/rethink_mount.xml"), idn=idn)
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, -0.01))
+
+ @property
+ def horizontal_radius(self):
+ # TODO: This may be inaccurate; just a placeholder for now
+ return 0.25
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e11ddca0521eb6d0a4bdba1e9d1582cef2fafa7c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/__init__.py
@@ -0,0 +1,24 @@
+from .objects import MujocoObject, MujocoXMLObject, MujocoGeneratedObject
+from .generated_objects import CompositeBodyObject, CompositeObject, PrimitiveObject
+from .object_groups import ObjectGroup
+
+from .xml_objects import (
+ BottleObject,
+ CanObject,
+ LemonObject,
+ MilkObject,
+ BreadObject,
+ CerealObject,
+ SquareNutObject,
+ RoundNutObject,
+ MilkVisualObject,
+ BreadVisualObject,
+ CerealVisualObject,
+ CanVisualObject,
+ PlateWithHoleObject,
+ DoorObject,
+)
+from .primitive import *
+from .composite import *
+from .composite_body import *
+from .group import *
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..67b6a445552775cb953873d0f3dddcea00dcaf2d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/__init__.py
@@ -0,0 +1,8 @@
+from .bin import Bin
+from .hammer import HammerObject
+from .lid import Lid
+from .pot_with_handles import PotWithHandlesObject
+from .hollow_cylinder import HollowCylinderObject
+from .cone import ConeObject
+from .hook_frame import HookFrame
+from .stand_with_mount import StandWithMount
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/bin.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/bin.py
new file mode 100644
index 0000000000000000000000000000000000000000..a69afc06c4892894a3fec2b01625f8cf5137a870
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/bin.py
@@ -0,0 +1,146 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.objects import CompositeObject
+from robosuite.utils.mjcf_utils import CustomMaterial, add_to_dict
+
+
+class Bin(CompositeObject):
+ """
+ Generates a four-walled bin container with an open top.
+ Args:
+ name (str): Name of this Bin object
+ bin_size (3-array): (x,y,z) full size of bin
+ wall_thickness (float): How thick to make walls of bin
+ transparent_walls (bool): If True, walls will be semi-translucent
+ friction (3-array or None): If specified, sets friction values for this bin. None results in default values
+ density (float): Density value to use for all geoms. Defaults to 1000
+ use_texture (bool): If true, geoms will be defined by realistic textures and rgba values will be ignored
+ rgba (4-array or None): If specified, sets rgba values for all geoms. None results in default values
+ """
+
+ def __init__(
+ self,
+ name,
+ bin_size=(0.3, 0.3, 0.15),
+ wall_thickness=0.01,
+ transparent_walls=True,
+ friction=None,
+ density=1000.0,
+ use_texture=True,
+ rgba=(0.2, 0.1, 0.0, 1.0),
+ ):
+ # Set name
+ self._name = name
+
+ # Set object attributes
+ self.bin_size = np.array(bin_size)
+ self.wall_thickness = wall_thickness
+ self.transparent_walls = transparent_walls
+ self.friction = friction if friction is None else np.array(friction)
+ self.density = density
+ self.use_texture = use_texture
+ self.rgba = rgba
+ self.bin_mat_name = "dark_wood_mat"
+
+ # Element references
+ self._base_geom = "base"
+
+ # Other private attributes
+ self._important_sites = {}
+
+ # Create dictionary of values to create geoms for composite object and run super init
+ super().__init__(**self._get_geom_attrs())
+
+ # Define materials we want to use for this object
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "3 3",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ bin_mat = CustomMaterial(
+ texture="WoodDark",
+ tex_name="dark_wood",
+ mat_name=self.bin_mat_name,
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.append_material(bin_mat)
+
+ def _get_geom_attrs(self):
+ """
+ Creates geom elements that will be passed to superclass CompositeObject constructor
+ Returns:
+ dict: args to be used by CompositeObject to generate geoms
+ """
+ # Initialize dict of obj args that we'll pass to the CompositeObject constructor
+ base_args = {
+ "total_size": self.bin_size / 2.0,
+ "name": self.name,
+ "locations_relative_to_center": True,
+ "obj_types": "all",
+ "density": self.density,
+ }
+ obj_args = {}
+
+ # Base
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(0, 0, -(self.bin_size[2] - self.wall_thickness) / 2),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=(
+ np.array((self.bin_size[0], self.bin_size[1], self.wall_thickness))
+ - np.array((self.wall_thickness, self.wall_thickness, 0))
+ )
+ / 2,
+ geom_names=self._base_geom,
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.bin_mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ )
+
+ # Walls
+ x_vals = np.array(
+ [0, -(self.bin_size[0] - self.wall_thickness) / 2, 0, (self.bin_size[0] - self.wall_thickness) / 2]
+ )
+ y_vals = np.array(
+ [-(self.bin_size[1] - self.wall_thickness) / 2, 0, (self.bin_size[1] - self.wall_thickness) / 2, 0]
+ )
+ w_vals = np.array([self.bin_size[0], self.bin_size[1], self.bin_size[0], self.bin_size[1]])
+ r_vals = np.array([np.pi / 2, 0, -np.pi / 2, np.pi])
+ if self.transparent_walls:
+ wall_rgba = (1.0, 1.0, 1.0, 0.3)
+ wall_mat = None
+ else:
+ wall_rgba = None if self.use_texture else self.rgba
+ wall_mat = self.bin_mat_name if self.use_texture else None
+ for i, (x, y, w, r) in enumerate(zip(x_vals, y_vals, w_vals, r_vals)):
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(x, y, 0),
+ geom_quats=T.convert_quat(T.axisangle2quat(np.array([0, 0, r])), to="wxyz"),
+ geom_sizes=(self.wall_thickness / 2, w / 2, self.bin_size[2] / 2),
+ geom_names=f"wall{i}",
+ geom_rgbas=wall_rgba,
+ geom_materials=wall_mat,
+ geom_frictions=self.friction,
+ )
+
+ # Add back in base args and site args
+ obj_args.update(base_args)
+
+ # Return this dict
+ return obj_args
+
+ @property
+ def base_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to bin base
+ """
+ return [self.correct_naming(self._base_geom)]
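+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the original robosuite file): generate a
+    # 30 cm x 30 cm x 15 cm bin and count the geoms in the resulting MJCF element.
+    demo_bin = Bin(name="demo_bin", bin_size=(0.3, 0.3, 0.15), transparent_walls=True)
+    print(demo_bin.name, "geoms:", len(demo_bin.get_obj().findall(".//geom")))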
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/cone.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/cone.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b35e3fe36205e4162b27addf5d9da2d50629354
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/cone.py
@@ -0,0 +1,156 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.objects import CompositeObject
+from robosuite.utils.mjcf_utils import RED, CustomMaterial, add_to_dict
+
+
+class ConeObject(CompositeObject):
+ """
+ Generates an approximate cone object by using cylinder or box geoms.
+ Args:
+ name (str): Name of this Cone object
+ outer_radius (float): Radius of cone base
+ inner_radius (float): Radius of cone tip (since everything is a cylinder or box)
+ height (float): Height of cone
+ ngeoms (int): Number of cylinder or box geoms used to approximate the cone. Use
+ more geoms to make the approximation better.
+ use_box (bool): If true, use box geoms instead of cylinders, corresponding to a
+ square pyramid shape instead of a conical shape.
+ """
+
+ def __init__(
+ self,
+ name,
+ outer_radius=0.0425,
+ inner_radius=0.03,
+ height=0.05,
+ ngeoms=8,
+ use_box=False,
+ rgba=None,
+ material=None,
+ density=1000.0,
+ solref=(0.02, 1.0),
+ solimp=(0.9, 0.95, 0.001),
+ friction=None,
+ ):
+
+ # Set object attributes
+ self._name = name
+ self.rgba = rgba
+ self.density = density
+ self.friction = friction if friction is None else np.array(friction)
+ self.solref = solref
+ self.solimp = solimp
+
+ self.has_material = material is not None
+ if self.has_material:
+ assert isinstance(material, CustomMaterial)
+ self.material = material
+
+ # Other private attributes
+ self._important_sites = {}
+
+ # radius of the tip and the base
+ self.r1 = inner_radius
+ self.r2 = outer_radius
+
+ # number of geoms used to approximate the cone
+ if ngeoms % 2 == 0:
+ # use an odd number of geoms for easier computation
+ ngeoms += 1
+ self.n = ngeoms
+
+ # cone height
+ self.height = height
+
+ # unit half-height for geoms
+ self.unit_height = (height / ngeoms) / 2.0
+
+ # unit radius for geom radius grid
+ self.unit_r = (self.r2 - self.r1) / (self.n - 1)
+
+ self.use_box = use_box
+
+ # Create dictionary of values to create geoms for composite object and run super init
+ super().__init__(**self._get_geom_attrs())
+
+ # Optionally add material
+ if self.has_material:
+ self.append_material(self.material)
+
+ def _get_geom_attrs(self):
+ """
+ Creates geom elements that will be passed to superclass CompositeObject constructor
+ Returns:
+ dict: args to be used by CompositeObject to generate geoms
+ """
+ # Initialize dict of obj args that we'll pass to the CompositeObject constructor
+ base_args = {
+ "total_size": [self.r2, self.r2, self.height / 2.0],
+ "name": self.name,
+ "locations_relative_to_center": True,
+ "obj_types": "all",
+ "density": self.density,
+ "solref": self.solref,
+ "solimp": self.solimp,
+ }
+ obj_args = {}
+
+ # stack the boxes / cylinders in the z-direction
+ ngeoms_each_side = (self.n - 1) // 2
+ geom_locations = [
+ (0.0, 0.0, i * self.unit_height * 2.0) for i in range(-ngeoms_each_side, ngeoms_each_side + 1)
+ ]
+
+ if self.use_box:
+ geom_sizes = [
+ (
+ self.r1 + i * self.unit_r,
+ self.r1 + i * self.unit_r,
+ self.unit_height,
+ )
+ for i in range(self.n)
+ ][::-1]
+ else:
+ geom_sizes = [
+ (
+ self.r1 + i * self.unit_r,
+ self.unit_height,
+ )
+ for i in range(self.n)
+ ][::-1]
+
+ for i in range(self.n):
+ # note: set geom condim to 4 for consistency with round-nut.xml
+ # geom_quat = np.array([np.cos(geom_angle / 2.), 0., 0., np.sin(geom_angle / 2.)])
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box" if self.use_box else "cylinder",
+ geom_locations=geom_locations[i],
+ geom_quats=None,
+ geom_sizes=geom_sizes[i],
+ geom_names="c_{}".format(i),
+ # geom_rgbas=None if self.has_material else self.rgba,
+ geom_rgbas=self.rgba,
+ geom_materials=self.material.mat_attrib["name"] if self.has_material else None,
+ geom_frictions=self.friction,
+ geom_condims=4,
+ )
+
+ # Sites
+ obj_args["sites"] = [
+ {
+ "name": "center",
+ "pos": (0, 0, 0),
+ "size": "0.002",
+ "rgba": RED,
+ "type": "sphere",
+ }
+ ]
+
+ # Add back in base args and site args
+ obj_args.update(base_args)
+
+ # Return this dict
+ return obj_args
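+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the original robosuite file): approximate a
+    # 5 cm tall cone with a stack of cylinder geoms (an even ngeoms is bumped to odd).
+    cone = ConeObject(name="demo_cone", outer_radius=0.0425, inner_radius=0.03, height=0.05, ngeoms=8)
+    print(cone.name, "geoms:", len(cone.get_obj().findall(".//geom")))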
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hammer.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hammer.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcfa96c9d038f2197201dba5b3094526e4e69ff1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hammer.py
@@ -0,0 +1,282 @@
+from collections.abc import Iterable
+
+import numpy as np
+
+from robosuite.models.objects import CompositeObject
+from robosuite.utils.mjcf_utils import BLUE, CYAN, GREEN, RED, CustomMaterial, add_to_dict
+
+
+class HammerObject(CompositeObject):
+ """
+ Generates a Hammer object with a cylindrical or box-shaped handle, cubic head, cylindrical face and triangular claw
+ (used in Handover task)
+
+ Args:
+ name (str): Name of this Hammer object
+
+ handle_shape (str): Either "box", for a box-shaped handle, or "cylinder", for a cylindrically-shaped handle
+
+ handle_radius (float or 2-array of float): Either specific or range of values to draw randomly from
+ uniformly for the handle radius
+
+ handle_length (float or 2-array of float): Either specific or range of values to draw randomly from
+ uniformly for the handle length
+
+ handle_density (float or 2-array of float): Either specific or range of values to draw randomly from
+ uniformly for the handle density (in SI units). Note that the head density is this value scaled by head_density_ratio
+
+ handle_friction (float or 2-array of float): Either specific or range of values to draw randomly from
+ uniformly for the handle friction. Note that Mujoco default values are used for the head
+
+ head_density_ratio (float): Ratio of the head density (including face and claw) to the handle density
+
+ use_texture (bool): If true, geoms will be defined by realistic textures and rgba values will be ignored
+
+ rgba_handle (4-array or None): If specified, sets handle rgba values
+
+ rgba_head (4-array or None): If specified, sets head rgba values
+
+ rgba_face (4-array or None): If specified, sets face rgba values
+
+ rgba_claw (4-array or None): If specified, sets claw rgba values
+
+ Raises:
+ ValueError: [Invalid handle shape]
+ """
+
+ def __init__(
+ self,
+ name,
+ handle_shape="box",
+ handle_radius=(0.015, 0.02),
+ handle_length=(0.1, 0.25),
+ handle_density=(100, 250),
+ handle_friction=(3.0, 5.0),
+ head_density_ratio=2.0,
+ use_texture=True,
+ rgba_handle=None,
+ rgba_head=None,
+ rgba_face=None,
+ rgba_claw=None,
+ ):
+ # Set name
+ self._name = name
+
+ # Set handle type and density ratio
+ self.handle_shape = handle_shape
+ self.head_density_ratio = head_density_ratio
+
+ # Set radius and length ranges
+ self.handle_radius_range = handle_radius if isinstance(handle_radius, Iterable) else [handle_radius] * 2
+ self.handle_length_range = handle_length if isinstance(handle_length, Iterable) else [handle_length] * 2
+ self.handle_density_range = handle_density if isinstance(handle_density, Iterable) else [handle_density] * 2
+ self.handle_friction_range = handle_friction if isinstance(handle_friction, Iterable) else [handle_friction] * 2
+
+ # Sample actual radius and length, as well as head half-size
+ self.handle_radius = np.random.uniform(self.handle_radius_range[0], self.handle_radius_range[1])
+ self.handle_length = np.random.uniform(self.handle_length_range[0], self.handle_length_range[1])
+ self.handle_density = np.random.uniform(self.handle_density_range[0], self.handle_density_range[1])
+ self.handle_friction = np.random.uniform(self.handle_friction_range[0], self.handle_friction_range[1])
+ self.head_halfsize = np.random.uniform(self.handle_radius, self.handle_radius * 1.2)
+
+ # Initialize RGBA values and texture flag
+ self.use_texture = use_texture
+ self.rgba_handle = rgba_handle if rgba_handle is not None else RED
+ self.rgba_head = rgba_head if rgba_head is not None else CYAN
+ self.rgba_face = rgba_face if rgba_face is not None else BLUE
+ self.rgba_claw = rgba_claw if rgba_claw is not None else GREEN
+
+ # Create dictionary of values to create geoms for composite object and run super init
+ super().__init__(**self._get_geom_attrs())
+
+ # Define materials we want to use for this object
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "3 3",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ metal = CustomMaterial(
+ texture="SteelScratched",
+ tex_name="metal",
+ mat_name="metal_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ wood = CustomMaterial(
+ texture="WoodLight",
+ tex_name="wood",
+ mat_name="wood_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+
+ # Append materials to object
+ self.append_material(metal)
+ self.append_material(wood)
+
+ def _get_geom_attrs(self):
+ """
+ Creates geom elements that will be passed to superclass CompositeObject constructor
+
+ Returns:
+ dict: args to be used by CompositeObject to generate geoms
+ """
+ full_size = np.array(
+ (3.2 * self.head_halfsize, self.head_halfsize, self.handle_length + 2 * self.head_halfsize)
+ )
+ # Initialize dict of obj args that we'll pass to the CompositeObject constructor
+ base_args = {
+ "total_size": full_size / 2.0,
+ "name": self.name,
+ "locations_relative_to_center": True,
+ "obj_types": "all",
+ }
+ obj_args = {}
+
+ # Add handle component
+ assert self.handle_shape in {
+ "cylinder",
+ "box",
+ }, "Error loading hammer: Handle type must either be 'box' or 'cylinder', got {}.".format(self.handle_shape)
+ add_to_dict(
+ dic=obj_args,
+ geom_types="cylinder" if self.handle_shape == "cylinder" else "box",
+ geom_locations=(0, 0, 0),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array([self.handle_radius, self.handle_length / 2.0])
+ if self.handle_shape == "cylinder"
+ else np.array([self.handle_radius, self.handle_radius, self.handle_length / 2.0]),
+ geom_names="handle",
+ geom_rgbas=None if self.use_texture else self.rgba_handle,
+ geom_materials="wood_mat" if self.use_texture else None,
+ geom_frictions=(self.handle_friction, 0.005, 0.0001),
+ density=self.handle_density,
+ )
+
+ # Add head component
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(0, 0, self.handle_length / 2.0 + self.head_halfsize),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array([self.head_halfsize * 2, self.head_halfsize, self.head_halfsize]),
+ geom_names="head",
+ geom_rgbas=None if self.use_texture else self.rgba_head,
+ geom_materials="metal_mat" if self.use_texture else None,
+ geom_frictions=None,
+ density=self.handle_density * self.head_density_ratio,
+ )
+
+ # Add neck component
+ add_to_dict(
+ dic=obj_args,
+ geom_types="cylinder",
+ geom_locations=(self.head_halfsize * 2.2, 0, self.handle_length / 2.0 + self.head_halfsize),
+ geom_quats=(0.707106, 0, 0.707106, 0),
+ geom_sizes=np.array([self.head_halfsize * 0.8, self.head_halfsize * 0.2]),
+ geom_names="neck",
+ geom_rgbas=None if self.use_texture else self.rgba_face,
+ geom_materials="metal_mat" if self.use_texture else None,
+ geom_frictions=None,
+ density=self.handle_density * self.head_density_ratio,
+ )
+
+ # Add face component
+ add_to_dict(
+ dic=obj_args,
+ geom_types="cylinder",
+ geom_locations=(self.head_halfsize * 2.8, 0, self.handle_length / 2.0 + self.head_halfsize),
+ geom_quats=(0.707106, 0, 0.707106, 0),
+ geom_sizes=np.array([self.head_halfsize, self.head_halfsize * 0.4]),
+ geom_names="face",
+ geom_rgbas=None if self.use_texture else self.rgba_face,
+ geom_materials="metal_mat" if self.use_texture else None,
+ geom_frictions=None,
+ density=self.handle_density * self.head_density_ratio,
+ )
+
+ # Add claw component
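+        # quaternion (0.9238795, 0, 0.3826834, 0) is a 45-degree rotation about the y-axis, tilting the claw box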
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(-self.head_halfsize * 2, 0, self.handle_length / 2.0 + self.head_halfsize),
+ geom_quats=(0.9238795, 0, 0.3826834, 0),
+ geom_sizes=np.array([self.head_halfsize * 0.7072, self.head_halfsize * 0.95, self.head_halfsize * 0.7072]),
+ geom_names="claw",
+ geom_rgbas=None if self.use_texture else self.rgba_claw,
+ geom_materials="metal_mat" if self.use_texture else None,
+ geom_frictions=None,
+ density=self.handle_density * self.head_density_ratio,
+ )
+
+ # Add back in base args
+ obj_args.update(base_args)
+
+ # Return this dict
+ return obj_args
+
+ @property
+ def init_quat(self):
+ """
+ Generates a new random orientation for the hammer
+
+ Returns:
+ np.array: (x, y, z, w) quaternion orientation for the hammer
+ """
+ # Randomly sample between +/- flip (such that the hammer head faces one way or the other)
+ return np.array([0.5, -0.5, 0.5, -0.5]) if np.random.rand() >= 0.5 else np.array([-0.5, -0.5, -0.5, -0.5])
+
+ @property
+ def handle_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to hammer handle
+ """
+ return self.correct_naming(["handle"])
+
+ @property
+ def head_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to hammer head
+ """
+ return self.correct_naming(["head"])
+
+ @property
+ def face_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to hammer face
+ """
+ return self.correct_naming(["neck", "face"])
+
+ @property
+ def claw_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to hammer claw
+ """
+ return self.correct_naming(["claw"])
+
+ @property
+ def all_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to all hammer components
+ """
+ return self.handle_geoms + self.head_geoms + self.face_geoms + self.claw_geoms
+
+ @property
+ def bottom_offset(self):
+ return np.array([0, 0, -self.handle_radius])
+
+ @property
+ def top_offset(self):
+ return np.array([0, 0, self.handle_radius])
+
+ @property
+ def horizontal_radius(self):
+ return self.head_halfsize + 0.5 * self.handle_length
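+
+
+# Illustrative usage sketch -- not part of the upstream robosuite file. It assumes the class
+# defined above is named HammerObject (as in upstream robosuite) and that robosuite and its
+# assets are installed; printed values vary per run because dimensions are sampled uniformly.
+if __name__ == "__main__":
+    hammer = HammerObject(name="demo_hammer", handle_shape="cylinder")
+    print("sampled handle radius / length:", hammer.handle_radius, hammer.handle_length)
+    print("handle geoms:", hammer.handle_geoms)
+    print("init_quat (random +/- flip):", hammer.init_quat)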
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hollow_cylinder.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hollow_cylinder.py
new file mode 100644
index 0000000000000000000000000000000000000000..329dba0b045bfd99390fa269d34625785f845934
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hollow_cylinder.py
@@ -0,0 +1,146 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.objects import CompositeObject
+from robosuite.utils.mjcf_utils import RED, CustomMaterial, add_to_dict
+
+
+class HollowCylinderObject(CompositeObject):
+ """
+ Generates an approximate hollow cylinder object by using box geoms.
+ Args:
+ name (str): Name of this HollowCylinder object
+ outer_radius (float): Outer radius of hollow cylinder
+ inner_radius (float): Inner radius of hollow cylinder
+        height (float): Half-height of the hollow cylinder (each box geom uses it directly as its half-depth)
+ ngeoms (int): Number of box geoms used to approximate the cylindrical shell. Use
+ more geoms to make the approximation better.
+ make_half (bool): If true, only make half of the shell.
+ """
+
+ def __init__(
+ self,
+ name,
+ outer_radius=0.0425,
+ inner_radius=0.03,
+ height=0.05,
+ ngeoms=8,
+ rgba=None,
+ material=None,
+ density=1000.0,
+ solref=(0.02, 1.0),
+ solimp=(0.9, 0.95, 0.001),
+ friction=None,
+ make_half=False,
+ ):
+
+ # Set object attributes
+ self._name = name
+ self.rgba = rgba
+ self.density = density
+ self.friction = friction if friction is None else np.array(friction)
+ self.solref = solref
+ self.solimp = solimp
+ self.make_half = make_half # if True, will only make half the hollow cylinder
+
+ self.has_material = material is not None
+ if self.has_material:
+ assert isinstance(material, CustomMaterial)
+ self.material = material
+
+ # Other private attributes
+ self._important_sites = {}
+
+ # radius of the inner cup hole and entire cup
+ self.r1 = inner_radius
+ self.r2 = outer_radius
+
+ # number of geoms used to approximate the cylindrical shell
+ self.n = ngeoms
+
+ # cylinder half-height
+ self.height = height
+
+ # half-width of each box inferred from triangle of radius + box half-length
+ # since the angle will be (360 / n) / 2
+ self.unit_box_width = self.r2 * np.sin(np.pi / self.n)
+
+ # half-height of each box inferred from the same triangle with inner radius
+ self.unit_box_height = (self.r2 - self.r1) * np.cos(np.pi / self.n) / 2.0
+
+ # each box geom depth will end up defining the height of the cup
+ self.unit_box_depth = self.height
+
+ # radius of intermediate circle that connects all box centers
+ self.int_r = (self.r1 * np.cos(np.pi / self.n)) + self.unit_box_height
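+        # with the defaults (inner_radius=0.03, outer_radius=0.0425, ngeoms=8):
+        #   unit_box_width ~= 0.0163, unit_box_height ~= 0.0058, int_r ~= 0.0335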
+
+ # Create dictionary of values to create geoms for composite object and run super init
+ super().__init__(**self._get_geom_attrs())
+
+ # Optionally add material
+ if self.has_material:
+ self.append_material(self.material)
+
+ def _get_geom_attrs(self):
+ """
+ Creates geom elements that will be passed to superclass CompositeObject constructor
+ Returns:
+ dict: args to be used by CompositeObject to generate geoms
+ """
+ # Initialize dict of obj args that we'll pass to the CompositeObject constructor
+ base_args = {
+ "total_size": [self.r2, self.r2, self.height],
+ "name": self.name,
+ "locations_relative_to_center": True,
+ "obj_types": "all",
+ "density": self.density,
+ "solref": self.solref,
+ "solimp": self.solimp,
+ }
+ obj_args = {}
+
+ n_make = self.n
+ if self.make_half:
+ # only make half the shell
+ n_make = (self.n // 2) + 1
+
+ # infer locations of all geoms with trigonometry
+ angle_step = 2.0 * np.pi / self.n
+ for i in range(n_make):
+ # we start with the top-most box object and proceed clockwise (thus an offset of np.pi)
+ geom_angle = np.pi - i * angle_step
+ geom_center = np.array([self.int_r * np.cos(geom_angle), self.int_r * np.sin(geom_angle), 0.0])
+ geom_quat = np.array([np.cos(geom_angle / 2.0), 0.0, 0.0, np.sin(geom_angle / 2.0)])
+ geom_size = np.array([self.unit_box_height, self.unit_box_width, self.unit_box_depth])
+
+ # note: set geom condim to 4 for consistency with round-nut.xml
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=tuple(geom_center),
+ geom_quats=tuple(geom_quat),
+ geom_sizes=tuple(geom_size),
+ geom_names="hc_{}".format(i),
+ # geom_rgbas=None if self.has_material else self.rgba,
+ geom_rgbas=self.rgba,
+ geom_materials=self.material.mat_attrib["name"] if self.has_material else None,
+ geom_frictions=self.friction,
+ geom_condims=4,
+ )
+
+ # Sites
+ obj_args["sites"] = [
+ {
+ "name": "center",
+ "pos": (0, 0, 0),
+ "size": "0.002",
+ "rgba": RED,
+ "type": "sphere",
+ }
+ ]
+
+ # Add back in base args and site args
+ obj_args.update(base_args)
+
+ # Return this dict
+ return obj_args
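+
+
+# Illustrative usage sketch -- not part of the upstream robosuite file; the radii below are
+# arbitrary example values. A larger ngeoms gives a rounder shell at the cost of more geoms.
+if __name__ == "__main__":
+    ring = HollowCylinderObject(name="demo_ring", outer_radius=0.05, inner_radius=0.035, ngeoms=16)
+    print("per-geom box half-sizes (h, w, d):", ring.unit_box_height, ring.unit_box_width, ring.unit_box_depth)
+    print("radius of the circle through the geom centers:", ring.int_r)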
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hook_frame.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hook_frame.py
new file mode 100644
index 0000000000000000000000000000000000000000..3741c7b74dac78a1963da1c2c01d7023fc9c85d5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/hook_frame.py
@@ -0,0 +1,332 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.objects import CompositeObject
+from robosuite.utils.mjcf_utils import BLUE, GREEN, RED, CustomMaterial, add_to_dict
+
+
+class HookFrame(CompositeObject):
+ """
+ Generates an upside down L-shaped frame (a "hook" shape), intended to be used with StandWithMount object.
+ Args:
+ name (str): Name of this object
+ frame_length (float): How long the frame is
+ frame_height (float): How tall the frame is
+ frame_thickness (float): How thick the frame is
+ hook_height (float): if not None, add a box geom at the edge of the hook with this height (not half-height)
+        grip_location (float): if not None, adds a grip at the given offset along the vertical rod (relative to the
+            center of the rod whose length is @frame_height)
+        grip_size ([float]): (R, H) half-width and half-height of the grip, which is built as a box geom for
+            stability. Set to None to not add a grip.
+ tip_size ([float]): if not None, adds a cone tip to the end of the hook for easier insertion, with the
+ provided (CH, LR, UR, H) where CH is the base cylinder height, LR and UR are the lower and upper radius
+ of the cone tip, and H is the half-height of the cone tip
+ friction (3-array or None): If specified, sets friction values for this object. None results in default values
+ density (float): Density value to use for all geoms. Defaults to 1000
+ use_texture (bool): If true, geoms will be defined by realistic textures and rgba values will be ignored
+ rgba (4-array or None): If specified, sets rgba values for all geoms. None results in default values
+ """
+
+ def __init__(
+ self,
+ name,
+ frame_length=0.3,
+ frame_height=0.2,
+ frame_thickness=0.025,
+ hook_height=None,
+ grip_location=None,
+ grip_size=None,
+ tip_size=None,
+ friction=None,
+ density=1000.0,
+ solref=(0.02, 1.0),
+ solimp=(0.9, 0.95, 0.001),
+ use_texture=True,
+ rgba=(0.2, 0.1, 0.0, 1.0),
+ ):
+ # Set name
+ self._name = name
+
+ # Set object attributes
+ self.size = None # Filled in automatically
+ self.frame_length = frame_length
+ self.frame_height = frame_height
+ self.frame_thickness = frame_thickness
+ self.hook_height = hook_height
+ self.grip_location = grip_location
+ self.grip_size = tuple(grip_size) if grip_size is not None else None
+ self.tip_size = tuple(tip_size) if tip_size is not None else None
+ self.friction = friction if friction is None else np.array(friction)
+ self.solref = solref
+ self.solimp = solimp
+ self.density = density
+ self.use_texture = use_texture
+ self.rgba = rgba
+ self.mat_name = "brass_mat"
+ self.grip_mat_name = "ceramic_mat"
+ self.tip_mat_name = "steel_mat"
+
+ # Other private attributes
+ self._important_sites = {}
+
+ # Create dictionary of values to create geoms for composite object and run super init
+ super().__init__(**self._get_geom_attrs())
+
+ # Define materials we want to use for this object
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "3 3",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ bin_mat = CustomMaterial(
+ texture="Brass",
+ tex_name="brass",
+ mat_name=self.mat_name,
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.append_material(bin_mat)
+ # optionally add material for grip
+ if (self.grip_location is not None) and (self.grip_size is not None):
+ grip_mat = CustomMaterial(
+ texture="Ceramic",
+ tex_name="ceramic",
+ mat_name=self.grip_mat_name,
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.append_material(grip_mat)
+ # optionally add material for tip
+ if self.tip_size is not None:
+ tip_mat = CustomMaterial(
+ texture="SteelScratched",
+ tex_name="steel",
+ mat_name=self.tip_mat_name,
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.append_material(tip_mat)
+
+ def _get_geom_attrs(self):
+ """
+ Creates geom elements that will be passed to superclass CompositeObject constructor
+ Returns:
+ dict: args to be used by CompositeObject to generate geoms
+ """
+ # Initialize dict of obj args that we'll pass to the CompositeObject constructor
+ self.size = np.array((self.frame_length, self.frame_thickness, self.frame_height))
+ if self.tip_size is not None:
+ self.size[2] += 2.0 * (self.tip_size[0] + (2.0 * self.tip_size[3]))
+ base_args = {
+ "total_size": self.size / 2,
+ "name": self.name,
+ "locations_relative_to_center": True,
+ "obj_types": "all",
+ "density": self.density,
+ "solref": self.solref,
+ "solimp": self.solimp,
+ }
+ obj_args = {}
+
+ # Vertical Frame
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=((self.frame_length - self.frame_thickness) / 2, 0, -self.frame_thickness / 2),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array((self.frame_thickness, self.frame_thickness, self.frame_height - self.frame_thickness))
+ / 2,
+ geom_names="vertical_frame",
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ )
+
+ # Horizontal Frame
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(0, 0, (self.frame_height - self.frame_thickness) / 2),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array((self.frame_length, self.frame_thickness, self.frame_thickness)) / 2,
+ geom_names="horizontal_frame",
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ )
+
+ # optionally add hook at the end of the horizontal frame
+ if self.hook_height is not None:
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(
+ (-self.frame_length + self.frame_thickness) / 2,
+ 0,
+ (self.frame_height + self.hook_height) / 2,
+ ),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array((self.frame_thickness, self.frame_thickness, self.hook_height)) / 2,
+ geom_names="hook_frame",
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ )
+
+ # optionally add a grip
+ if (self.grip_location is not None) and (self.grip_size is not None):
+ # note: use box grip instead of cylindrical grip for stability
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(
+ (self.frame_length - self.frame_thickness) / 2,
+ 0,
+ (-self.frame_thickness / 2) + self.grip_location,
+ ),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=(self.grip_size[0], self.grip_size[0], self.grip_size[1]),
+ geom_names="grip_frame",
+ # geom_rgbas=None if self.use_texture else self.rgba,
+ geom_rgbas=(0.13, 0.13, 0.13, 1.0),
+ geom_materials=self.grip_mat_name if self.use_texture else None,
+ # geom_frictions=self.friction,
+ geom_frictions=(1.0, 0.005, 0.0001), # use default friction
+ )
+
+ # optionally add cone tip
+ if self.tip_size is not None:
+ from robosuite.models.objects import ConeObject
+
+ cone = ConeObject(
+ name="cone",
+ outer_radius=self.tip_size[2],
+ inner_radius=self.tip_size[1],
+ height=self.tip_size[3],
+ # ngeoms=8,
+ ngeoms=50,
+ use_box=True,
+ # use_box=False,
+ rgba=None,
+ material=None,
+ density=self.density,
+ solref=self.solref,
+ solimp=self.solimp,
+ friction=self.friction,
+ )
+ cone_args = cone._get_geom_attrs()
+
+ # DIRTY HACK: add them in reverse (in hindsight, should just turn this into a composite body...)
+ cone_geom_types = cone_args["geom_types"]
+ cone_geom_locations = cone_args["geom_locations"]
+ cone_geom_sizes = cone_args["geom_sizes"][::-1]
+
+ # location of mount site is the translation we need
+ cylinder_offset = (
+ (self.frame_length - self.frame_thickness) / 2,
+ 0,
+ -self.frame_height / 2 - self.tip_size[0], # account for half-height of cylinder
+ )
+ cone_offset = (
+ cylinder_offset[0],
+ cylinder_offset[1],
+ cylinder_offset[2]
+ - self.tip_size[0]
+ - self.tip_size[3] / 2.0, # need to move below cylinder, and account for half-height
+ )
+
+ # first add cylinder
+ add_to_dict(
+ dic=obj_args,
+ geom_types="cylinder",
+ geom_locations=cylinder_offset,
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=(self.tip_size[2], self.tip_size[0]),
+ geom_names="tip_cylinder",
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.tip_mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ )
+
+ # then add cone tip geoms
+ for i in range(len(cone_geom_types)):
+ add_to_dict(
+ dic=obj_args,
+ geom_types=cone_geom_types[i],
+ geom_locations=(
+ cone_geom_locations[i][0] + cone_offset[0],
+ cone_geom_locations[i][1] + cone_offset[1],
+ cone_geom_locations[i][2] + cone_offset[2],
+ ),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=cone_geom_sizes[i],
+ geom_names="tip_cone_{}".format(i),
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.tip_mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ )
+
+ # Sites
+ obj_args["sites"] = [
+ {
+ "name": f"hang_site",
+ "pos": (-self.frame_length / 2, 0, (self.frame_height - self.frame_thickness) / 2),
+ "size": "0.002",
+ "rgba": RED,
+ "type": "sphere",
+ },
+ {
+ "name": f"mount_site",
+ "pos": ((self.frame_length - self.frame_thickness) / 2, 0, -self.frame_height / 2),
+ "size": "0.002",
+ "rgba": GREEN,
+ "type": "sphere",
+ },
+ {
+ "name": f"intersection_site",
+ "pos": (
+ (self.frame_length - self.frame_thickness) / 2,
+ 0,
+ (self.frame_height - self.frame_thickness) / 2,
+ ),
+ "size": "0.002",
+ "rgba": BLUE,
+ "type": "sphere",
+ },
+ ]
+
+ if self.tip_size is not None:
+ obj_args["sites"].append(
+ {
+ "name": f"tip_site",
+ "pos": (
+ ((self.frame_length - self.frame_thickness) / 2),
+ 0,
+ (-self.frame_height / 2) - 2.0 * self.tip_size[0] - self.tip_size[3],
+ ),
+ "size": "0.002",
+ "rgba": RED,
+ "type": "sphere",
+ },
+ )
+
+ # Add back in base args and site args
+ obj_args.update(base_args)
+
+ # Return this dict
+ return obj_args
+
+ @property
+ def init_quat(self):
+ """
+ Rotate the frame on its side so it is flat
+ Returns:
+ np.array: (x, y, z, w) quaternion orientation for this object
+ """
+ # Rotate 90 degrees about two consecutive axes to make the hook lie on the table instead of being upright.
+ return T.quat_multiply(
+ np.array([0, 0.0, np.sqrt(2) / 2.0, np.sqrt(2) / 2.0]),
+ np.array([-np.sqrt(2) / 2.0, 0.0, 0.0, np.sqrt(2) / 2.0]),
+ )
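+
+
+# Illustrative usage sketch -- not part of the upstream robosuite file; the hook/grip values
+# below are arbitrary examples, not settings used elsewhere in Phantom.
+if __name__ == "__main__":
+    frame = HookFrame(name="demo_frame", hook_height=0.05, grip_location=0.0, grip_size=(0.02, 0.05))
+    print("bounding size (x, y, z):", frame.size)
+    print("init_quat (rotated to lie flat):", frame.init_quat)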
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/lid.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/lid.py
new file mode 100644
index 0000000000000000000000000000000000000000..3eb602bd9308bbe387e4648939d61c00c32f5520
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/lid.py
@@ -0,0 +1,136 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.objects import CompositeObject
+from robosuite.utils.mjcf_utils import CustomMaterial, add_to_dict
+
+
+class Lid(CompositeObject):
+ """
+ Generates a square lid with a simple handle.
+ Args:
+ name (str): Name of this Lid object
+ lid_size (3-array): (length, width, thickness) of lid
+ handle_size (3-array): (thickness, length, height) of handle
+ transparent (bool): If True, lid will be semi-translucent
+ friction (3-array or None): If specified, sets friction values for this lid. None results in default values
+ density (float): Density value to use for all geoms. Defaults to 1000
+ use_texture (bool): If true, geoms will be defined by realistic textures and rgba values will be ignored
+ rgba (4-array or None): If specified, sets rgba values for all geoms. None results in default values
+ """
+
+ def __init__(
+ self,
+ name,
+ lid_size=(0.3, 0.3, 0.01),
+ handle_size=(0.02, 0.08, 0.03),
+ transparent=True,
+ friction=None,
+ density=250.0,
+ use_texture=True,
+ rgba=(0.2, 0.1, 0.0, 1.0),
+ ):
+ # Set name
+ self._name = name
+
+ # Set object attributes
+ self.lid_size = np.array(lid_size)
+ self.handle_size = np.array(handle_size)
+ self.transparent = transparent
+ self.friction = friction if friction is None else np.array(friction)
+ self.density = density
+ self.use_texture = use_texture
+ self.rgba = rgba
+ self.lid_mat_name = "dark_wood_mat"
+
+ # Element references
+ self._handle_geom = "handle"
+
+ # Other private attributes
+ self._important_sites = {}
+
+ # Create dictionary of values to create geoms for composite object and run super init
+ super().__init__(**self._get_geom_attrs())
+
+ # Define materials we want to use for this object
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "3 3",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ lid_mat = CustomMaterial(
+ texture="WoodDark",
+ tex_name="dark_wood",
+ mat_name=self.lid_mat_name,
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.append_material(lid_mat)
+
+ def _get_geom_attrs(self):
+ """
+ Creates geom elements that will be passed to superclass CompositeObject constructor
+ Returns:
+ dict: args to be used by CompositeObject to generate geoms
+ """
+ full_height = self.lid_size[2] + self.handle_size[2]
+ full_size = np.array([self.lid_size[0], self.lid_size[1], full_height])
+ # Initialize dict of obj args that we'll pass to the CompositeObject constructor
+ base_args = {
+ "total_size": full_size / 2.0,
+ "name": self.name,
+ "locations_relative_to_center": True,
+ "obj_types": "all",
+ }
+ obj_args = {}
+
+ # Top
+ if self.transparent:
+ top_rgba = (1.0, 1.0, 1.0, 0.3)
+ top_mat = None
+ else:
+ top_rgba = None if self.use_texture else self.rgba
+ top_mat = self.lid_mat_name if self.use_texture else None
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(0, 0, (-full_size[2] + self.lid_size[2]) / 2),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array((full_size[0], full_size[1], self.lid_size[2])) / 2,
+ geom_names="top",
+ geom_rgbas=top_rgba,
+ geom_materials=top_mat,
+ geom_frictions=self.friction,
+ density=self.density,
+ )
+
+ # Handle
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(0, 0, (full_size[2] - self.handle_size[2]) / 2),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=self.handle_size / 2,
+ geom_names=self._handle_geom,
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.lid_mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ density=self.density * 2,
+ )
+
+ # Add back in base args and site args
+ obj_args.update(base_args)
+
+ # Return this dict
+ return obj_args
+
+ @property
+ def handle_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to lid handle
+ """
+ return [self.correct_naming(self._handle_geom)]
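+
+
+# Illustrative usage sketch -- not part of the upstream robosuite file; the lid size below is an
+# arbitrary example. transparent=False uses the dark-wood texture instead of the see-through top.
+if __name__ == "__main__":
+    lid = Lid(name="demo_lid", lid_size=(0.25, 0.25, 0.01), transparent=False)
+    print("handle geoms:", lid.handle_geoms)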
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/pot_with_handles.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/pot_with_handles.py
new file mode 100644
index 0000000000000000000000000000000000000000..6783e5ebe9fc74a7c08c2f44536e6ecdeb73a9d7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/pot_with_handles.py
@@ -0,0 +1,350 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.objects import CompositeObject
+from robosuite.utils.mjcf_utils import BLUE, GREEN, RED, CustomMaterial, add_to_dict, array_to_string
+
+
+class PotWithHandlesObject(CompositeObject):
+ """
+ Generates the Pot object with side handles (used in TwoArmLift)
+
+ Args:
+ name (str): Name of this Pot object
+
+ body_half_size (3-array of float): If specified, defines the (x,y,z) half-dimensions of the main pot
+ body. Otherwise, defaults to [0.07, 0.07, 0.07]
+
+ handle_radius (float): Determines the pot handle radius
+
+ handle_length (float): Determines the pot handle length
+
+ handle_width (float): Determines the pot handle width
+
+        handle_friction (float): Friction value to use for pot handles. Defaults to 1.0
+
+ density (float): Density value to use for all geoms. Defaults to 1000
+
+ use_texture (bool): If true, geoms will be defined by realistic textures and rgba values will be ignored
+
+ rgba_body (4-array or None): If specified, sets pot body rgba values
+
+ rgba_handle_0 (4-array or None): If specified, sets handle 0 rgba values
+
+ rgba_handle_1 (4-array or None): If specified, sets handle 1 rgba values
+
+ solid_handle (bool): If true, uses a single geom to represent the handle
+
+ thickness (float): How thick to make the pot body walls
+ """
+
+ def __init__(
+ self,
+ name,
+ body_half_size=(0.07, 0.07, 0.07),
+ handle_radius=0.01,
+ handle_length=0.09,
+ handle_width=0.09,
+ handle_friction=1.0,
+ density=1000,
+ use_texture=True,
+ rgba_body=None,
+ rgba_handle_0=None,
+ rgba_handle_1=None,
+ solid_handle=False,
+ thickness=0.01, # For body
+ ):
+ # Set name
+ self._name = name
+
+ # Set object attributes
+ self.body_half_size = np.array(body_half_size)
+ self.thickness = thickness
+ self.handle_radius = handle_radius
+ self.handle_length = handle_length
+ self.handle_width = handle_width
+ self.handle_friction = handle_friction
+ self.density = density
+ self.use_texture = use_texture
+ self.rgba_body = np.array(rgba_body) if rgba_body else RED
+ self.rgba_handle_0 = np.array(rgba_handle_0) if rgba_handle_0 else GREEN
+ self.rgba_handle_1 = np.array(rgba_handle_1) if rgba_handle_1 else BLUE
+ self.solid_handle = solid_handle
+
+ # Element references to be filled when generated
+ self._handle0_geoms = None
+ self._handle1_geoms = None
+ self.pot_base = None
+
+ # Other private attributes
+ self._important_sites = {}
+
+ # Create dictionary of values to create geoms for composite object and run super init
+ super().__init__(**self._get_geom_attrs())
+
+ # Define materials we want to use for this object
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "1 1",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ redwood = CustomMaterial(
+ texture="WoodRed",
+ tex_name="redwood",
+ mat_name="pot_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ greenwood = CustomMaterial(
+ texture="WoodGreen",
+ tex_name="greenwood",
+ mat_name="handle0_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ bluewood = CustomMaterial(
+ texture="WoodBlue",
+ tex_name="bluewood",
+ mat_name="handle1_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.append_material(redwood)
+ self.append_material(greenwood)
+ self.append_material(bluewood)
+
+ def _get_geom_attrs(self):
+ """
+ Creates geom elements that will be passed to superclass CompositeObject constructor
+
+ Returns:
+ dict: args to be used by CompositeObject to generate geoms
+ """
+ full_size = np.array(
+ (
+ self.body_half_size,
+ self.body_half_size + self.handle_length * 2,
+ self.body_half_size,
+ )
+ )
+ # Initialize dict of obj args that we'll pass to the CompositeObject constructor
+ base_args = {
+ "total_size": full_size / 2.0,
+ "name": self.name,
+ "locations_relative_to_center": True,
+ "obj_types": "all",
+ }
+ site_attrs = []
+ obj_args = {}
+
+ # Initialize geom lists
+ self._handle0_geoms = []
+ self._handle1_geoms = []
+
+ # Add main pot body
+ # Base geom
+ name = f"base"
+ self.pot_base = [name]
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(0, 0, -self.body_half_size[2] + self.thickness / 2),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array([self.body_half_size[0], self.body_half_size[1], self.thickness / 2]),
+ geom_names=name,
+ geom_rgbas=None if self.use_texture else self.rgba_body,
+ geom_materials="pot_mat" if self.use_texture else None,
+ geom_frictions=None,
+ density=self.density,
+ )
+
+ # Walls
+ x_off = np.array(
+ [0, -(self.body_half_size[0] - self.thickness / 2), 0, self.body_half_size[0] - self.thickness / 2]
+ )
+ y_off = np.array(
+ [-(self.body_half_size[1] - self.thickness / 2), 0, self.body_half_size[1] - self.thickness / 2, 0]
+ )
+ w_vals = np.array(
+ [self.body_half_size[1], self.body_half_size[0], self.body_half_size[1], self.body_half_size[0]]
+ )
+ r_vals = np.array([np.pi / 2, 0, -np.pi / 2, np.pi])
+ for i, (x, y, w, r) in enumerate(zip(x_off, y_off, w_vals, r_vals)):
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(x, y, 0),
+ geom_quats=T.convert_quat(T.axisangle2quat(np.array([0, 0, r])), to="wxyz"),
+ geom_sizes=np.array([self.thickness / 2, w, self.body_half_size[2]]),
+ geom_names=f"body{i}",
+ geom_rgbas=None if self.use_texture else self.rgba_body,
+ geom_materials="pot_mat" if self.use_texture else None,
+ geom_frictions=None,
+ density=self.density,
+ )
+
+ # Add handles
+ main_bar_size = np.array(
+ [
+ self.handle_width / 2 + self.handle_radius,
+ self.handle_radius,
+ self.handle_radius,
+ ]
+ )
+ side_bar_size = np.array([self.handle_radius, self.handle_length / 2, self.handle_radius])
+ handle_z = self.body_half_size[2] - self.handle_radius
+ for i, (g_list, handle_side, rgba) in enumerate(
+ zip([self._handle0_geoms, self._handle1_geoms], [1.0, -1.0], [self.rgba_handle_0, self.rgba_handle_1])
+ ):
+ handle_center = np.array((0, handle_side * (self.body_half_size[1] + self.handle_length), handle_z))
+ # Solid handle case
+ if self.solid_handle:
+ name = f"handle{i}"
+ g_list.append(name)
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=handle_center,
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array([self.handle_width / 2, self.handle_length / 2, self.handle_radius]),
+ geom_names=name,
+ geom_rgbas=None if self.use_texture else rgba,
+ geom_materials=f"handle{i}_mat" if self.use_texture else None,
+ geom_frictions=(self.handle_friction, 0.005, 0.0001),
+ density=self.density,
+ )
+ # Hollow handle case
+ else:
+ # Center bar
+ name = f"handle{i}_c"
+ g_list.append(name)
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=handle_center,
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=main_bar_size,
+ geom_names=name,
+ geom_rgbas=None if self.use_texture else rgba,
+ geom_materials=f"handle{i}_mat" if self.use_texture else None,
+ geom_frictions=(self.handle_friction, 0.005, 0.0001),
+ density=self.density,
+ )
+ # Side bars
+ for bar_side, suffix in zip([-1.0, 1.0], ["-", "+"]):
+ name = f"handle{i}_{suffix}"
+ g_list.append(name)
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(
+ bar_side * self.handle_width / 2,
+ handle_side * (self.body_half_size[1] + self.handle_length / 2),
+ handle_z,
+ ),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=side_bar_size,
+ geom_names=name,
+ geom_rgbas=None if self.use_texture else rgba,
+ geom_materials=f"handle{i}_mat" if self.use_texture else None,
+ geom_frictions=(self.handle_friction, 0.005, 0.0001),
+ density=self.density,
+ )
+ # Add relevant site
+ handle_site = self.get_site_attrib_template()
+ handle_name = f"handle{i}"
+ handle_site.update(
+ {
+ "name": handle_name,
+ "pos": array_to_string(handle_center - handle_side * np.array([0, 0.005, 0])),
+ "size": "0.005",
+ "rgba": rgba,
+ }
+ )
+ site_attrs.append(handle_site)
+ # Add to important sites
+ self._important_sites[f"handle{i}"] = self.naming_prefix + handle_name
+
+ # Add pot body site
+ pot_site = self.get_site_attrib_template()
+ center_name = "center"
+ pot_site.update(
+ {
+ "name": center_name,
+ "size": "0.005",
+ }
+ )
+ site_attrs.append(pot_site)
+ # Add to important sites
+ self._important_sites["center"] = self.naming_prefix + center_name
+
+ # Add back in base args and site args
+ obj_args.update(base_args)
+ obj_args["sites"] = site_attrs # All sites are part of main (top) body
+
+ # Return this dict
+ return obj_args
+
+ @property
+    def handle_distance(self):
+        """
+ Calculates how far apart the handles are
+
+ Returns:
+ float: handle distance
+ """
+ return self.body_half_size[1] * 2 + self.handle_length * 2
+
+ @property
+ def handle0_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to handle0 (green handle)
+ """
+ return self.correct_naming(self._handle0_geoms)
+
+ @property
+ def handle1_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to handle1 (blue handle)
+ """
+ return self.correct_naming(self._handle1_geoms)
+
+ @property
+ def handle_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to both handles
+ """
+ return self.handle0_geoms + self.handle1_geoms
+
+ @property
+ def important_sites(self):
+ """
+ Returns:
+ dict: In addition to any default sites for this object, also provides the following entries
+
+ :`'handle0'`: Name of handle0 location site
+ :`'handle1'`: Name of handle1 location site
+ """
+ # Get dict from super call and add to it
+ dic = super().important_sites
+ dic.update(self._important_sites)
+ return dic
+
+ @property
+ def bottom_offset(self):
+ return np.array([0, 0, -1 * self.body_half_size[2]])
+
+ @property
+ def top_offset(self):
+ return np.array([0, 0, self.body_half_size[2]])
+
+ @property
+ def horizontal_radius(self):
+ return np.sqrt(2) * (max(self.body_half_size) + self.handle_length)
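+
+
+# Illustrative usage sketch -- not part of the upstream robosuite file. The handle sites and
+# handle_distance are the natural hooks for a TwoArmLift-style task; this is only a sketch.
+if __name__ == "__main__":
+    pot = PotWithHandlesObject(name="demo_pot", solid_handle=False)
+    print("handle distance:", pot.handle_distance)
+    print("important sites:", pot.important_sites)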
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/stand_with_mount.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/stand_with_mount.py
new file mode 100644
index 0000000000000000000000000000000000000000..903c35a19a2443a58a51b9ce05f8af0b6ea8f8c8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite/stand_with_mount.py
@@ -0,0 +1,199 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.models.objects import CompositeObject
+from robosuite.utils.mjcf_utils import RED, CustomMaterial, add_to_dict
+
+
+class StandWithMount(CompositeObject):
+ """
+ Generates a flat stand with a four-walled mount sticking out of the top.
+ Args:
+ name (str): Name of this object
+ size (3-array): (x,y,z) full size of object
+ mount_location (2-array): (x,y) location to place mount, relative to center of stand
+        mount_width (float): How wide the mount is (measured from the outside of the walls)
+        wall_thickness (float): How thick to make the walls of the mount
+ initialize_on_side (bool): If True, will initialize this stand on its side (tipped over)
+ add_hole_vis (bool): If True, adds a rim around the top of the walls, to help make the hole more visually distinctive
+ friction (3-array or None): If specified, sets friction values for this object. None results in default values
+ density (float): Density value to use for all geoms. Defaults to 1000
+ use_texture (bool): If true, geoms will be defined by realistic textures and rgba values will be ignored
+ rgba (4-array or None): If specified, sets rgba values for all geoms. None results in default values
+ """
+
+ def __init__(
+ self,
+ name,
+ size=(0.3, 0.3, 0.15),
+ mount_location=(0.0, 0.0),
+ mount_width=0.05,
+ wall_thickness=0.01,
+ base_thickness=0.01,
+ initialize_on_side=True,
+ add_hole_vis=False,
+ friction=None,
+ density=1000.0,
+ solref=(0.02, 1.0),
+ solimp=(0.9, 0.95, 0.001),
+ use_texture=True,
+ rgba=(0.2, 0.1, 0.0, 1.0),
+ ):
+ # Set name
+ self._name = name
+
+ # Set object attributes
+ self.size = np.array(size)
+ self.mount_location = np.array(mount_location)
+ self.mount_width = mount_width
+ self.wall_thickness = wall_thickness
+ self.base_thickness = base_thickness
+ self.initialize_on_side = initialize_on_side
+ self.add_hole_vis = add_hole_vis
+ self.friction = friction if friction is None else np.array(friction)
+ self.solref = solref
+ self.solimp = solimp
+ self.density = density
+ self.use_texture = use_texture
+ self.rgba = rgba
+ self.mat_name = "brass_mat"
+
+ # Element references
+ self._base_geom = "base"
+
+ # Other private attributes
+ self._important_sites = {}
+
+ # Create dictionary of values to create geoms for composite object and run super init
+ super().__init__(**self._get_geom_attrs())
+
+ # Define materials we want to use for this object
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "3 3",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ bin_mat = CustomMaterial(
+ texture="Brass",
+ tex_name="brass",
+ mat_name=self.mat_name,
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.append_material(bin_mat)
+
+ def _get_geom_attrs(self):
+ """
+ Creates geom elements that will be passed to superclass CompositeObject constructor
+ Returns:
+ dict: args to be used by CompositeObject to generate geoms
+ """
+ # Initialize dict of obj args that we'll pass to the CompositeObject constructor
+ base_args = {
+ "total_size": self.size / 2.0,
+ "name": self.name,
+ "locations_relative_to_center": True,
+ "obj_types": "all",
+ "density": self.density,
+ "solref": self.solref,
+ "solimp": self.solimp,
+ }
+ obj_args = {}
+
+ # Base
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(0, 0, -(self.size[2] - self.base_thickness) / 2),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=np.array((self.size[0], self.size[1], self.base_thickness)) / 2,
+ geom_names=self._base_geom,
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ )
+
+ # Walls
+ x_vals = (
+ np.array(
+ [0, -(self.mount_width - self.wall_thickness) / 2, 0, (self.mount_width - self.wall_thickness) / 2]
+ )
+ + self.mount_location[0]
+ )
+ y_vals = (
+ np.array(
+ [-(self.mount_width - self.wall_thickness) / 2, 0, (self.mount_width - self.wall_thickness) / 2, 0]
+ )
+ + self.mount_location[1]
+ )
+ r_vals = np.array([np.pi / 2, 0, -np.pi / 2, np.pi])
+ for i, (x, y, r) in enumerate(zip(x_vals, y_vals, r_vals)):
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(x, y, self.base_thickness / 2),
+ geom_quats=T.convert_quat(T.axisangle2quat(np.array([0, 0, r])), to="wxyz"),
+ geom_sizes=(self.wall_thickness / 2, self.mount_width / 2, (self.size[2] - self.base_thickness) / 2),
+ geom_names=f"wall{i}",
+ geom_rgbas=None if self.use_texture else self.rgba,
+ geom_materials=self.mat_name if self.use_texture else None,
+ geom_frictions=self.friction,
+ )
+
+ if self.add_hole_vis:
+ # add a purely visual rim
+ del base_args["obj_types"]
+ obj_args["obj_types"] = len(obj_args["geom_types"]) * ["all"]
+
+ vis_geom_side = 0.7 * ((self.mount_width - self.wall_thickness) / 2)
+ vis_geom_size = (vis_geom_side, vis_geom_side, self.wall_thickness / 2)
+ add_to_dict(
+ dic=obj_args,
+ geom_types="box",
+ geom_locations=(self.mount_location[0], self.mount_location[1], (self.size[2] / 2) - vis_geom_size[2]),
+ geom_quats=(1, 0, 0, 0),
+ geom_sizes=vis_geom_size,
+ geom_names="hole_vis",
+ geom_rgbas=(0.0, 1.0, 0.0, 0.5),
+ geom_materials=None,
+ geom_frictions=self.friction,
+ obj_types="visual",
+ )
+
+ # Sites
+ obj_args["sites"] = [
+ {
+ "name": f"mount_site",
+ "pos": (0, 0, self.size[2] / 2),
+ "size": "0.002",
+ "rgba": RED,
+ "type": "sphere",
+ }
+ ]
+
+ # Add back in base args and site args
+ obj_args.update(base_args)
+
+ # Return this dict
+ return obj_args
+
+ @property
+ def init_quat(self):
+ """
+ Optionally rotate the mount on its side so it is flat
+ Returns:
+ np.array: (x, y, z, w) quaternion orientation for this object
+ """
+ # Rotate 90 deg about Y axis if at all
+ return np.array([0, 0.707107, 0, 0.707107]) if self.initialize_on_side else np.array([0, 0, 0, 1])
+
+ @property
+ def base_geoms(self):
+ """
+ Returns:
+ list of str: geom names corresponding to base
+ """
+ return [self.correct_naming(self._base_geom)]
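+
+
+# Illustrative usage sketch -- not part of the upstream robosuite file. With
+# initialize_on_side=False the stand is built upright, ready to receive a HookFrame's mount end.
+if __name__ == "__main__":
+    stand = StandWithMount(name="demo_stand", initialize_on_side=False)
+    print("base geoms:", stand.base_geoms)
+    print("init_quat:", stand.init_quat)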
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79ccc9a8e74b24e77ac07a4045d2be68e8872167
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/__init__.py
@@ -0,0 +1,2 @@
+from .hinged_box import HingedBoxObject
+from .ratcheting_wrench import RatchetingWrenchObject
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/hinged_box.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/hinged_box.py
new file mode 100644
index 0000000000000000000000000000000000000000..12aa3cd0c62ab94df33033ea0bbcd88693876866
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/hinged_box.py
@@ -0,0 +1,141 @@
+import numpy as np
+
+from robosuite.models.objects import BoxObject, CompositeBodyObject, CylinderObject
+from robosuite.utils.mjcf_utils import BLUE, RED, CustomMaterial, array_to_string
+
+
+class HingedBoxObject(CompositeBodyObject):
+ """
+ An example object that demonstrates the CompositeBodyObject functionality. This object consists of two cube bodies
+ joined together by a hinge joint.
+
+ Args:
+ name (str): Name of this object
+
+ box1_size (3-array): (L, W, H) half-sizes for the first box
+
+ box2_size (3-array): (L, W, H) half-sizes for the second box
+
+ use_texture (bool): set True if using wood textures for the blocks
+ """
+
+ def __init__(
+ self,
+ name,
+ box1_size=(0.025, 0.025, 0.025),
+ box2_size=(0.025, 0.025, 0.0125),
+ use_texture=True,
+ ):
+ # Set box sizes
+ self.box1_size = np.array(box1_size)
+ self.box2_size = np.array(box2_size)
+
+ # Set texture attributes
+ self.use_texture = use_texture
+ self.box1_material = None
+ self.box2_material = None
+ self.box1_rgba = RED
+ self.box2_rgba = BLUE
+
+ # Define materials we want to use for this object
+ if self.use_texture:
+ # Remove RGBAs
+ self.box1_rgba = None
+ self.box2_rgba = None
+
+ # Set materials for each box
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "3 3",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ self.box1_material = CustomMaterial(
+ texture="WoodRed",
+ tex_name="box1_tex",
+ mat_name="box1_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+ self.box2_material = CustomMaterial(
+ texture="WoodBlue",
+ tex_name="box2_tex",
+ mat_name="box2_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+
+ # Create objects
+ objects = []
+ for i, (size, mat, rgba) in enumerate(
+ zip(
+ (self.box1_size, self.box2_size),
+ (self.box1_material, self.box2_material),
+ (self.box1_rgba, self.box2_rgba),
+ )
+ ):
+ objects.append(
+ BoxObject(
+ name=f"box{i + 1}",
+ size=size,
+ rgba=rgba,
+ material=mat,
+ )
+ )
+
+ # Also add hinge for visualization
+ objects.append(
+ CylinderObject(
+ name="hinge",
+ size=np.array(
+ [min(self.box1_size[2], self.box2_size[2]) / 5.0, min(self.box1_size[0], self.box2_size[0])]
+ ),
+ rgba=[0.5, 0.5, 0, 1],
+ obj_type="visual",
+ )
+ )
+
+ # Define hinge joint
+ rel_hinge_pos = [self.box2_size[0], 0, -self.box2_size[2]] # want offset in all except y-axis
+ hinge_joint = {
+ "name": "box_hinge",
+ "type": "hinge",
+ "axis": "0 1 0", # y-axis hinge
+ "pos": array_to_string(rel_hinge_pos),
+ "stiffness": "0.0001",
+ "limited": "true",
+ "range": "0 1.57",
+ }
+
+ # Define positions -- second box should lie on top of first box with edge aligned at hinge joint
+ # Hinge visualizer should be aligned at hinge joint location
+ positions = [
+ np.zeros(3), # First box is centered at top-level body anyways
+ np.array([-(self.box2_size[0] - self.box1_size[0]), 0, self.box1_size[2] + self.box2_size[2]]),
+ np.array(rel_hinge_pos),
+ ]
+
+ quats = [
+ None, # Default quaternion for box 1
+ None, # Default quaternion for box 2
+ [0.707, 0.707, 0, 0], # Rotated 90 deg about x-axis
+ ]
+
+ # Define parents -- which body each is aligned to
+ parents = [
+ None, # box 1 attached to top-level body
+ objects[0].root_body, # box 2 attached to box 1
+ objects[1].root_body, # hinge attached to box 2
+ ]
+
+ # Run super init
+ super().__init__(
+ name=name,
+ objects=objects,
+ object_locations=positions,
+ object_quats=quats,
+ object_parents=parents,
+ body_joints={objects[1].root_body: [hinge_joint]},
+ )
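+
+
+# Illustrative usage sketch -- not part of the upstream robosuite file. The hinge joint lives on
+# box 2, so the smaller box can swing between 0 and 90 degrees about the y-axis.
+if __name__ == "__main__":
+    hinged_box = HingedBoxObject(name="demo_hinged_box")
+    print("root body:", hinged_box.root_body)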
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/ratcheting_wrench.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/ratcheting_wrench.py
new file mode 100644
index 0000000000000000000000000000000000000000..6686541956d90f7fd06125332bbc342e6c7b5ac1
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/composite_body/ratcheting_wrench.py
@@ -0,0 +1,150 @@
+import numpy as np
+
+from robosuite.models.objects import BoxObject, CompositeBodyObject, CylinderObject, HollowCylinderObject
+from robosuite.utils.mjcf_utils import CustomMaterial
+
+
+class RatchetingWrenchObject(CompositeBodyObject):
+ """
+ A ratcheting wrench made out of mujoco primitives.
+ Args:
+ name (str): Name of this object
+ handle_size ([float]): (L, W, H) half-sizes for the handle (center part of wrench)
+ outer_radius_1 (float): Outer radius of first end of wrench
+ inner_radius_1 (float): Inner radius of first end of wrench
+ height_1 (float): Height of first end of wrench
+ outer_radius_2 (float): Outer radius of second end of wrench
+ inner_radius_2 (float): Inner radius of second end of wrench
+ height_2 (float): Height of second end of wrench
+ ngeoms (int): Number of box geoms used to approximate the ends of the wrench. Use
+ more geoms to make the approximation better.
+ grip_size ([float]): (R, H) radius and half-height for the box grip. Set to None
+ to not add a grip.
+ """
+
+ def __init__(
+ self,
+ name,
+ handle_size=(0.08, 0.01, 0.005),
+ outer_radius_1=0.0425,
+ inner_radius_1=0.03,
+ height_1=0.05,
+ outer_radius_2=0.0425,
+ inner_radius_2=0.03,
+ height_2=0.05,
+ ngeoms=8,
+ grip_size=None,
+ # rgba=None,
+ density=1000.0,
+ solref=(0.02, 1.0),
+ solimp=(0.9, 0.95, 0.001),
+ friction=None,
+ ):
+ # Object properties
+ self.handle_size = tuple(handle_size)
+ self.outer_radii = (outer_radius_1, outer_radius_2)
+ self.inner_radii = (inner_radius_1, inner_radius_2)
+ self.heights = (height_1, height_2)
+ self.ngeoms = ngeoms
+ self.grip_size = tuple(grip_size) if grip_size is not None else None
+
+ # Define materials we want to use for this object
+ tex_attrib = {
+ "type": "cube",
+ }
+ mat_attrib = {
+ "texrepeat": "3 3",
+ "specular": "0.4",
+ "shininess": "0.1",
+ }
+ wrench_mat = CustomMaterial(
+ texture="SteelScratched",
+ tex_name="steel",
+ mat_name="steel_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+
+ if self.grip_size is not None:
+ grip_mat = CustomMaterial(
+ texture="Ceramic",
+ tex_name="ceramic",
+ mat_name="ceramic_mat",
+ tex_attrib=tex_attrib,
+ mat_attrib=mat_attrib,
+ )
+
+ # Create objects
+ objects = []
+
+ # each end of the wrench is modeled by a hollow cylinder
+ for i in range(2):
+ objects.append(
+ HollowCylinderObject(
+ name=f"hole{i + 1}",
+ outer_radius=self.outer_radii[i],
+ inner_radius=self.inner_radii[i],
+ height=self.heights[i],
+ ngeoms=self.ngeoms,
+ rgba=None,
+ material=wrench_mat,
+ density=density,
+ solref=solref,
+ solimp=solimp,
+ friction=friction,
+ make_half=False,
+ )
+ )
+
+ # also add center box geom for handle
+ objects.append(
+ BoxObject(
+ name="handle",
+ size=handle_size,
+ rgba=None,
+ material=wrench_mat,
+ density=density,
+ solref=solref,
+ solimp=solimp,
+ friction=friction,
+ )
+ )
+
+ # Define positions (top-level body is centered at handle)
+ hole_1_box_geom_height = 2.0 * objects[0].unit_box_height
+ hole_2_box_geom_height = 2.0 * objects[1].unit_box_height
+ positions = [
+ # this computation ensures no gaps between the center bar geom and the two wrench holes at the end
+ np.array([-handle_size[0] - self.outer_radii[0] + hole_1_box_geom_height, 0, 0]),
+ np.array([handle_size[0] + self.outer_radii[1] - hole_2_box_geom_height, 0, 0]),
+ np.zeros(3),
+ ]
+ quats = [None, None, None]
+ parents = [None, None, None]
+
+ # maybe add grip
+ if self.grip_size is not None:
+ objects.append(
+ BoxObject(
+ name="grip",
+ size=[self.grip_size[0], self.grip_size[0], self.grip_size[1]],
+ rgba=(0.13, 0.13, 0.13, 1.0),
+ density=density,
+ solref=solref,
+ solimp=solimp,
+ friction=(1.0, 0.005, 0.0001), # use default friction
+ )
+ )
+ positions.append(np.zeros(3))
+ quats.append((np.sqrt(2) / 2.0, 0.0, np.sqrt(2) / 2.0, 0.0)) # rotate 90 degrees about y-axis
+ parents.append(None)
+
+ # Run super init
+ super().__init__(
+ name=name,
+ objects=objects,
+ object_locations=positions,
+ object_quats=quats,
+ object_parents=parents,
+ joints=[dict(type="free", damping="0.0005")], # be consistent with round-nut.xml
+ )
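+
+
+# Illustrative usage sketch -- not part of the upstream robosuite file; the asymmetric radii
+# below are arbitrary example values for the two hollow-cylinder ends.
+if __name__ == "__main__":
+    wrench = RatchetingWrenchObject(name="demo_wrench", outer_radius_2=0.035, inner_radius_2=0.024)
+    print("root body:", wrench.root_body)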
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/generated_objects.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/generated_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..770624b848099721f9d86ec8193e2cbd0c6e6ca2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/generated_objects.py
@@ -0,0 +1,785 @@
+from copy import deepcopy
+
+import numpy as np
+
+from robosuite.models.objects import MujocoGeneratedObject, MujocoObject
+from robosuite.utils.mjcf_utils import (
+ OBJECT_COLLISION_COLOR,
+ CustomMaterial,
+ add_prefix,
+ array_to_string,
+ find_elements,
+ new_body,
+ new_geom,
+ new_joint,
+ new_site,
+)
+
+
+class CompositeBodyObject(MujocoGeneratedObject):
+ """
+ An object constructed out of multiple bodies to make more complex shapes.
+
+ Args:
+ name (str): Name of overall object
+
+ objects (MujocoObject or list of MujocoObjects): object(s) to combine to form the composite body object.
+ Note that these objects will be added sequentially, so if an object is required to be nested relative to
+ another object, that nested object should be listed after the parent object. Note that all top-level joints
+            for any of the input objects are automatically stripped
+
+ object_locations (list): list of body locations in the composite. Each
+ location should be a list or tuple of 3 elements and all
+ locations are taken relative to that object's parent body. Giving None for a location results in (0,0,0)
+ for that object.
+
+ object_quats (None or list): list of (w, x, y, z) quaternions for each body. None results in (1,0,0,0) for
+ that object.
+
+ object_parents (None or list): Parent bodies to append each object to. Note that specifying "None" will
+ automatically append all objects to the root body ("root")
+
+ joints (None or list): Joints to use for the top-level composite body object. If None, no joints will be used
+ for this top-level object. If "default", a single free joint will be added to the top-level body of this
+ object. Otherwise, should be a list of dictionaries, where each dictionary should specify the specific
+ joint attributes necessary. See http://www.mujoco.org/book/XMLreference.html#joint for reference.
+
+ body_joints (None or dict): If specified, maps body names to joint specifications to append to that
+ body. If None, no extra joints will be used. If mapped value is "default", a single free joint will be
+ added to the specified body. Otherwise, should be a list of dictionaries, where each dictionary should
+ specify the specific joint attributes necessary. See http://www.mujoco.org/book/XMLreference.html#joint
+ for reference.
+
+ sites (None or list): list of sites to add to top-level composite body object. If None, only the default
+ top-level object site will be used. Otherwise, should be a list of dictionaries, where each dictionary
+ should specify the appropriate attributes for the given site.
+ See http://www.mujoco.org/book/XMLreference.html#site for reference.
+
+ total_size (None or np.array): if provided, use this to describe the bounding box for this composite body
+ object. Can also be used to specify @object_locations relative to the lower left corner of the bounding
+ box defined by @total_size, instead of the center of this body, with @locations_relative_to_corner.
+
+ locations_relative_to_corner (bool): if True, must supply @total_size. All object locations will be
+ relative to the lower left corner of the bounding box.
+ """
+
+ def __init__(
+ self,
+ name,
+ objects,
+ object_locations,
+ object_quats=None,
+ object_parents=None,
+ joints="default",
+ body_joints=None,
+ sites=None,
+ total_size=None,
+ locations_relative_to_corner=False,
+ ):
+ # Always call superclass first
+ super().__init__()
+
+ self._name = name
+
+ # Set internal variable geometric properties which will be modified later
+ self._object_absolute_positions = {"root": np.zeros(3)} # maps body names to abs positions (rel to root)
+ self._top = 0
+ self._bottom = 0
+ self._horizontal = 0
+
+ # Standardize inputs
+ if isinstance(objects, MujocoObject):
+ self.objects = [objects]
+ elif type(objects) in {list, tuple}:
+ self.objects = list(objects)
+ else:
+ # Invalid objects received
+ raise ValueError("Invalid objects received, got type: {}".format(type(objects)))
+
+ n_objects = len(self.objects)
+ self.object_locations = np.array(object_locations)
+ self.object_quats = deepcopy(object_quats) if object_quats is not None else [None] * n_objects
+ self.object_parents = deepcopy(object_parents) if object_parents is not None else ["root"] * n_objects
+
+ # Set joints
+ if joints == "default":
+ self.joint_specs = [self.get_joint_attrib_template()] # default free joint
+ elif joints is None:
+ self.joint_specs = []
+ else:
+ self.joint_specs = joints
+
+ # Set body joints
+ if body_joints is None:
+ body_joints = {}
+ self.body_joint_specs = body_joints
+
+ # Make sure all joints are named appropriately
+ j_num = 0
+ for joint_spec in self.joint_specs:
+ if "name" not in joint_spec:
+ joint_spec["name"] = "joint{}".format(j_num)
+ j_num += 1
+
+ # Set sites
+ self.site_specs = deepcopy(sites) if sites is not None else []
+ # Add default site
+ site_element_attr = self.get_site_attrib_template()
+ site_element_attr["rgba"] = "1 0 0 0"
+ site_element_attr["name"] = "default_site"
+ self.site_specs.append(site_element_attr)
+
+ # Make sure all sites are named appropriately
+ s_num = 0
+ for site_spec in self.site_specs:
+ if "name" not in site_spec:
+ site_spec["name"] = "site{}".format(s_num)
+ s_num += 1
+
+ self.total_size = np.array(total_size) if total_size is not None else None
+ self.locations_relative_to_corner = locations_relative_to_corner
+ if self.locations_relative_to_corner:
+ assert self.total_size is not None
+
+ # Always run sanity check
+ self.sanity_check()
+
+ # Lastly, parse XML tree appropriately
+ self._obj = self._get_object_subtree()
+
+ # Extract the appropriate private attributes for this
+ self._get_object_properties()
+
+ def _get_object_subtree(self):
+ # Initialize top-level body
+ obj = new_body(name="root")
+
+ # Add all joints and sites
+ for joint_spec in self.joint_specs:
+ obj.append(new_joint(**joint_spec))
+ for site_spec in self.site_specs:
+ obj.append(new_site(**site_spec))
+
+ # Loop through all objects and associated args and append them appropriately
+ for o, o_parent, o_pos, o_quat in zip(
+ self.objects, self.object_parents, self.object_locations, self.object_quats
+ ):
+ self._append_object(root=obj, obj=o, parent_name=o_parent, pos=o_pos, quat=o_quat)
+
+ # Loop through all joints and append them appropriately
+ for body_name, joint_specs in self.body_joint_specs.items():
+ self._append_joints(root=obj, body_name=body_name, joint_specs=joint_specs)
+
+ # Return final object
+ return obj
+
+ def _get_object_properties(self):
+ """
+ Extends the superclass method to add prefixes to all assets
+ """
+ super()._get_object_properties()
+ # Add prefix to all assets
+ add_prefix(root=self.asset, prefix=self.naming_prefix, exclude=self.exclude_from_prefixing)
+
+ def _append_object(self, root, obj, parent_name=None, pos=None, quat=None):
+ """
+ Helper function to add pre-generated object @obj to the body with name @parent_name
+
+ Args:
+ root (ET.Element): Top-level element to iteratively search through for @parent_name to add @obj to
+ obj (MujocoObject): Object to append to the body specified by @parent_name
+ parent_name (None or str): Body name to search for in @root to append @obj to.
+ None defaults to "root" (top-level body)
+ pos (None or 3-array): (x,y,z) relative offset from parent body when appending @obj.
+ None defaults to (0,0,0)
+            quat (None or 4-array): (w,x,y,z) relative quaternion rotation from parent body when appending @obj.
+ None defaults to (1,0,0,0)
+ """
+ # Set defaults if any are None
+ if parent_name is None:
+ parent_name = "root"
+ if pos is None:
+ pos = np.zeros(3)
+ if quat is None:
+ quat = np.array([1, 0, 0, 0])
+ # First, find parent body
+ parent = find_elements(root=root, tags="body", attribs={"name": parent_name}, return_first=True)
+ assert parent is not None, "Could not find parent body with name: {}".format(parent_name)
+ # Get the object xml element tree, remove its top-level joints, and modify its top-level pos / quat
+ child = obj.get_obj()
+ self._remove_joints(child)
+
+ if self.locations_relative_to_corner:
+            # convert the corner-relative object location into a position coordinate
+            # (whose origin is the center of this composite body)
+ cartesian_size = obj.get_bounding_box_half_size()
+ pos = [
+ (-self.total_size[0] + cartesian_size[0]) + pos[0],
+ (-self.total_size[1] + cartesian_size[1]) + pos[1],
+ (-self.total_size[2] + cartesian_size[2]) + pos[2],
+ ]
+
+ child.set("pos", array_to_string(pos))
+ child.set("quat", array_to_string(quat))
+ # Add this object and its assets to this composite object
+ self.merge_assets(other=obj)
+ parent.append(child)
+ # Update geometric properties for this composite object
+ obj_abs_pos = self._object_absolute_positions[parent_name] + np.array(pos)
+ self._object_absolute_positions[obj.root_body] = obj_abs_pos
+ self._top = max(self._top, obj_abs_pos[2] + obj.top_offset[2])
+ self._bottom = min(self._bottom, obj_abs_pos[2] + obj.bottom_offset[2])
+ self._horizontal = max(self._horizontal, max(obj_abs_pos[:2]) + obj.horizontal_radius)
+
+ def _append_joints(self, root, body_name=None, joint_specs="default"):
+ """
+ Appends all joints as specified by @joint_specs to @body.
+
+ Args:
+ root (ET.Element): Top-level element to iteratively search through for @body_name
+ body_name (None or str): Name of the body to append the joints to.
+ None defaults to "root" (top-level body)
+ joint_specs (str or list): List of joint specifications to add to the specified body, or
+ "default", which results in a single free joint
+ """
+ # Standardize inputs
+ if body_name is None:
+ body_name = "root"
+ if joint_specs == "default":
+ joint_specs = [self.get_joint_attrib_template()]
+ for i, joint_spec in enumerate(joint_specs):
+ if "name" not in joint_spec:
+ joint_spec["name"] = f"{body_name}_joint{i}"
+ # Search for body and make sure it exists
+ body = find_elements(root=root, tags="body", attribs={"name": body_name}, return_first=True)
+ assert body is not None, "Could not find body with name: {}".format(body_name)
+ # Add joint(s) to this body
+ for joint_spec in joint_specs:
+ body.append(new_joint(**joint_spec))
+
+ @staticmethod
+ def _remove_joints(body):
+ """
+ Helper function to strip all joints directly appended to the specified @body.
+
+ Args:
+ body (ET.Element): Body to strip joints from
+ """
+ children_to_remove = []
+ for child in body:
+ if child.tag == "joint":
+ children_to_remove.append(child)
+ for child in children_to_remove:
+ body.remove(child)
+
+ @property
+ def bottom_offset(self):
+ return np.array([0.0, 0.0, self._bottom])
+
+ @property
+ def top_offset(self):
+ return np.array([0.0, 0.0, self._top])
+
+ @property
+ def horizontal_radius(self):
+ return self._horizontal
+
+ def get_bounding_box_half_size(self):
+ if self.total_size is not None:
+ return np.array(self.total_size)
+ return super().get_bounding_box_half_size()
+
+
+class CompositeObject(MujocoGeneratedObject):
+ """
+ An object constructed out of basic geoms to make more intricate shapes.
+
+ Note that by default, specifying None for a specific geom element will usually set a value to the mujoco defaults.
+
+ Args:
+ name (str): Name of overall object
+
+ total_size (list): (x, y, z) half-size in each dimension for the bounding box for
+ this Composite object
+
+ geom_types (list): list of geom types in the composite. Must correspond
+ to MuJoCo geom primitives, such as "box" or "capsule".
+
+ geom_locations (list): list of geom locations in the composite. Each
+ location should be a list or tuple of 3 elements and all
+ locations are relative to the lower left corner of the total box
+ (e.g. (0, 0, 0) corresponds to this corner).
+
+ geom_sizes (list): list of geom sizes ordered the same as @geom_locations
+
+ geom_quats (None or list): list of (w, x, y, z) quaternions for each geom.
+
+ geom_names (None or list): list of geom names ordered the same as @geom_locations. The
+ names will get appended with an underscore to the passed name in @get_collision
+ and @get_visual
+
+ geom_rgbas (None or list): list of geom colors ordered the same as @geom_locations. If
+ passed as an argument, @rgba is ignored.
+
+ geom_materials (None or list of CustomTexture): list of custom textures to use for this object material
+
+ geom_frictions (None or list): list of geom frictions to use for each geom.
+
+ rgba (None or list): (r, g, b, a) default values to use if geom-specific @geom_rgbas isn't specified for a given element
+
+ density (float or list of float): either single value to use for all geom densities or geom-specific values
+
+ solref (list or list of list): parameters used for the mujoco contact solver. Can be single set of values or
+ element-specific values. See http://www.mujoco.org/book/modeling.html#CSolver for details.
+
+ solimp (list or list of list): parameters used for the mujoco contact solver. Can be single set of values or
+ element-specific values. See http://www.mujoco.org/book/modeling.html#CSolver for details.
+
+ locations_relative_to_center (bool): If true, @geom_locations will be considered relative to the center of the
+ overall object bounding box defined by @total_size. Else, the corner of this bounding box is considered the
+ origin.
+
+ joints (None or list): Joints to use for this composite object. If None, no joints will be used
+ for this top-level object. If "default", a single free joint will be added to this object.
+ Otherwise, should be a list of dictionaries, where each dictionary should specify the specific
+ joint attributes necessary. See http://www.mujoco.org/book/XMLreference.html#joint for reference.
+
+ sites (None or list): list of sites to add to this composite object. If None, only the default
+ object site will be used. Otherwise, should be a list of dictionaries, where each dictionary
+ should specify the appropriate attributes for the given site.
+ See http://www.mujoco.org/book/XMLreference.html#site for reference.
+
+ obj_types (str or list of str): either single obj_type for all geoms or geom-specific type. Choices are
+ {"collision", "visual", "all"}
+ """
+
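+    # Usage sketch (illustrative; not part of the original robosuite source). Builds a flat
+    # plate with a small block resting on top in one corner out of two box geoms; sizes are
+    # half-sizes and, by default, locations are measured from the lower-left corner of the
+    # bounding box defined by @total_size:
+    #
+    #   plate = CompositeObject(
+    #       name="plate_with_block",
+    #       total_size=[0.10, 0.10, 0.03],
+    #       geom_types=["box", "box"],
+    #       geom_sizes=[[0.10, 0.10, 0.01], [0.02, 0.02, 0.02]],
+    #       geom_locations=[[0.0, 0.0, 0.0], [0.16, 0.16, 0.02]],
+    #       geom_names=["plate", "block"],
+    #       rgba=[0.6, 0.6, 0.6, 1],
+    #   )
+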
+ def __init__(
+ self,
+ name,
+ total_size,
+ geom_types,
+ geom_sizes,
+ geom_locations,
+ geom_quats=None,
+ geom_names=None,
+ geom_rgbas=None,
+ geom_materials=None,
+ geom_frictions=None,
+ geom_condims=None,
+ rgba=None,
+ density=100.0,
+ solref=(0.02, 1.0),
+ solimp=(0.9, 0.95, 0.001),
+ locations_relative_to_center=False,
+ joints="default",
+ sites=None,
+ obj_types="all",
+ duplicate_collision_geoms=True,
+ ):
+ # Always call superclass first
+ super().__init__(duplicate_collision_geoms=duplicate_collision_geoms)
+
+ self._name = name
+
+ # Set joints
+ if joints == "default":
+ self.joint_specs = [self.get_joint_attrib_template()] # default free joint
+ elif joints is None:
+ self.joint_specs = []
+ else:
+ self.joint_specs = joints
+
+ # Make sure all joints are named appropriately
+ j_num = 0
+ for joint_spec in self.joint_specs:
+ if "name" not in joint_spec:
+ joint_spec["name"] = "joint{}".format(j_num)
+ j_num += 1
+
+ # Set sites
+ self.site_specs = deepcopy(sites) if sites is not None else []
+ # Add default site
+ site_element_attr = self.get_site_attrib_template()
+ site_element_attr["rgba"] = "1 0 0 0"
+ site_element_attr["name"] = "default_site"
+ self.site_specs.append(site_element_attr)
+
+ # Make sure all sites are named appropriately
+ s_num = 0
+ for site_spec in self.site_specs:
+ if "name" not in site_spec:
+ site_spec["name"] = "site{}".format(s_num)
+ s_num += 1
+
+ n_geoms = len(geom_types)
+ self.total_size = np.array(total_size)
+ self.geom_types = np.array(geom_types)
+ self.geom_sizes = deepcopy(geom_sizes)
+ self.geom_locations = np.array(geom_locations)
+ self.geom_quats = deepcopy(geom_quats) if geom_quats is not None else [None] * n_geoms
+ self.geom_names = list(geom_names) if geom_names is not None else [None] * n_geoms
+ self.geom_rgbas = list(geom_rgbas) if geom_rgbas is not None else [None] * n_geoms
+ self.geom_materials = list(geom_materials) if geom_materials is not None else [None] * n_geoms
+ self.geom_frictions = list(geom_frictions) if geom_frictions is not None else [None] * n_geoms
+ self.geom_condims = list(geom_condims) if geom_condims is not None else [None] * n_geoms
+ self.density = [density] * n_geoms if density is None or type(density) in {float, int} else list(density)
+ self.solref = [solref] * n_geoms if solref is None or type(solref[0]) in {float, int} else list(solref)
+        self.solimp = [solimp] * n_geoms if solimp is None or type(solimp[0]) in {float, int} else list(solimp)
+ self.rgba = rgba # override superclass setting of this variable
+ self.locations_relative_to_center = locations_relative_to_center
+ self.obj_types = [obj_types] * n_geoms if obj_types is None or type(obj_types) is str else list(obj_types)
+
+ # Always run sanity check
+ self.sanity_check()
+
+ # Lastly, parse XML tree appropriately
+ self._obj = self._get_object_subtree()
+
+ # Extract the appropriate private attributes for this
+ self._get_object_properties()
+
+ def get_bounding_box_half_size(self):
+ return np.array(self.total_size)
+
+ def in_box(self, position, object_position):
+ """
+ Checks whether the object is contained within this CompositeObject.
+ Useful for when the CompositeObject has holes and the object should
+        be within one of the holes. Makes an approximation by treating the
+        object as a point, and the CompositeObject as an axis-aligned box.
+
+        Args:
+ position: 3D body position of CompositeObject
+ object_position: 3D position of object to test for insertion
+ """
+ ub = position + self.total_size
+ lb = position - self.total_size
+
+ # fudge factor for the z-check, since after insertion the object falls to table
+ lb[2] -= 0.01
+
+ return np.all(object_position > lb) and np.all(object_position < ub)
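+
+    # Usage sketch (illustrative): checking whether a grasped object has been inserted into
+    # this composite, with body positions pulled from the sim (variable names are hypothetical):
+    #
+    #   composite_pos = sim.data.body_xpos[sim.model.body_name2id(plate.root_body)]
+    #   obj_pos = sim.data.body_xpos[sim.model.body_name2id(cube.root_body)]
+    #   inserted = plate.in_box(position=composite_pos, object_position=obj_pos)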
+
+ def _get_object_subtree(self):
+ # Initialize top-level body
+ obj = new_body(name="root")
+
+ # Add all joints and sites
+ for joint_spec in self.joint_specs:
+ obj.append(new_joint(**joint_spec))
+ for site_spec in self.site_specs:
+ obj.append(new_site(**site_spec))
+
+ # Loop through all geoms and generate the composite object
+ for i, (
+ obj_type,
+ g_type,
+ g_size,
+ g_loc,
+ g_name,
+ g_rgba,
+ g_friction,
+ g_condim,
+ g_quat,
+ g_material,
+ g_density,
+ g_solref,
+ g_solimp,
+ ) in enumerate(
+ zip(
+ self.obj_types,
+ self.geom_types,
+ self.geom_sizes,
+ self.geom_locations,
+ self.geom_names,
+ self.geom_rgbas,
+ self.geom_frictions,
+ self.geom_condims,
+ self.geom_quats,
+ self.geom_materials,
+ self.density,
+ self.solref,
+ self.solimp,
+ )
+ ):
+ # geom type
+ geom_type = g_type
+ # get cartesian size from size spec
+ size = g_size
+ cartesian_size = self._size_to_cartesian_half_lengths(geom_type, size)
+ if self.locations_relative_to_center:
+ # no need to convert
+ pos = g_loc
+ else:
+                # convert the corner-relative geom location into a position coordinate
+                # (whose origin is the center of the composite object)
+ pos = [
+ (-self.total_size[0] + cartesian_size[0]) + g_loc[0],
+ (-self.total_size[1] + cartesian_size[1]) + g_loc[1],
+ (-self.total_size[2] + cartesian_size[2]) + g_loc[2],
+ ]
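+                # Worked example (illustrative): with total_size = (0.1, 0.1, 0.03), a box geom of
+                # half-size (0.1, 0.1, 0.01) placed at g_loc = (0, 0, 0) maps to pos = (0, 0, -0.02),
+                # i.e. it sits flush with the bottom face of the bounding box.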
+
+ # geom name
+ geom_name = g_name if g_name is not None else f"g{i}"
+
+ # geom rgba
+ geom_rgba = g_rgba if g_rgba is not None else self.rgba
+
+ # geom friction
+ geom_friction = (
+ array_to_string(g_friction)
+ if g_friction is not None
+ else array_to_string(np.array([1.0, 0.005, 0.0001]))
+ ) # mujoco default
+
+ # Define base geom attributes
+ geom_attr = {
+ "size": size,
+ "pos": pos,
+ "name": geom_name,
+ "type": geom_type,
+ }
+
+ # Optionally define quat if specified
+ if g_quat is not None:
+ geom_attr["quat"] = array_to_string(g_quat)
+
+ # Add collision geom if necessary
+ if obj_type in {"collision", "all"}:
+ col_geom_attr = deepcopy(geom_attr)
+ col_geom_attr.update(self.get_collision_attrib_template())
+ if g_density is not None:
+ col_geom_attr["density"] = str(g_density)
+ col_geom_attr["friction"] = geom_friction
+ col_geom_attr["solref"] = array_to_string(g_solref)
+ col_geom_attr["solimp"] = array_to_string(g_solimp)
+ col_geom_attr["rgba"] = OBJECT_COLLISION_COLOR
+ if g_condim is not None:
+ col_geom_attr["condim"] = str(g_condim)
+ obj.append(new_geom(**col_geom_attr))
+
+ # Add visual geom if necessary
+ if obj_type in {"visual", "all"}:
+ vis_geom_attr = deepcopy(geom_attr)
+ vis_geom_attr.update(self.get_visual_attrib_template())
+ vis_geom_attr["name"] += "_vis"
+ if g_material is not None:
+ vis_geom_attr["material"] = g_material
+ vis_geom_attr["rgba"] = geom_rgba
+ obj.append(new_geom(**vis_geom_attr))
+
+ return obj
+
+ @staticmethod
+ def _size_to_cartesian_half_lengths(geom_type, geom_size):
+ """
+ converts from geom size specification to x, y, and z half-length bounding box
+ """
+ if geom_type in ["box", "ellipsoid"]:
+ return geom_size
+ if geom_type == "sphere":
+ # size is radius
+ return [geom_size[0], geom_size[0], geom_size[0]]
+ if geom_type == "capsule":
+ # size is radius, half-length of cylinder part
+ return [geom_size[0], geom_size[0], geom_size[0] + geom_size[1]]
+ if geom_type == "cylinder":
+ # size is radius, half-length
+ return [geom_size[0], geom_size[0], geom_size[1]]
+ raise Exception("unsupported geom type!")
+
+ @property
+ def bottom_offset(self):
+ return np.array([0.0, 0.0, -self.total_size[2]])
+
+ @property
+ def top_offset(self):
+ return np.array([0.0, 0.0, self.total_size[2]])
+
+ @property
+ def horizontal_radius(self):
+ return np.linalg.norm(self.total_size[:2], 2)
+
+
+class PrimitiveObject(MujocoGeneratedObject):
+ """
+    Base class for all programmatically generated mujoco objects,
+    i.e., every MujocoObject that does not have a corresponding xml file.
+
+ Args:
+ name (str): (unique) name to identify this generated object
+
+ size (n-tuple of float): relevant size parameters for the object, should be of size 1 - 3
+
+ rgba (4-tuple of float): Color
+
+ density (float): Density
+
+ friction (3-tuple of float): (sliding friction, torsional friction, and rolling friction).
+            A single float can also be specified, in order to set the sliding friction; the other values will
+ be set to the MuJoCo default. See http://www.mujoco.org/book/modeling.html#geom for details.
+
+ solref (2-tuple of float): MuJoCo solver parameters that handle contact.
+ See http://www.mujoco.org/book/XMLreference.html for more details.
+
+ solimp (3-tuple of float): MuJoCo solver parameters that handle contact.
+ See http://www.mujoco.org/book/XMLreference.html for more details.
+
+ material (CustomMaterial or `'default'` or None): if "default", add a template material and texture for this
+ object that is used to color the geom(s).
+ Otherwise, input is expected to be a CustomMaterial object
+
+ See http://www.mujoco.org/book/XMLreference.html#asset for specific details on attributes expected for
+ Mujoco texture / material tags, respectively
+
+ Note that specifying a custom texture in this way automatically overrides any rgba values set
+
+ joints (None or str or list of dict): Joints for this object. If None, no joint will be created. If "default",
+            a single (free) joint will be created. Else, should be a list of dict, where each dictionary corresponds to
+ a joint that will be created for this object. The dictionary should specify the joint attributes
+ (type, pos, etc.) according to the MuJoCo xml specification.
+
+ obj_type (str): Geom elements to generate / extract for this object. Must be one of:
+
+ :`'collision'`: Only collision geoms are returned (this corresponds to group 0 geoms)
+ :`'visual'`: Only visual geoms are returned (this corresponds to group 1 geoms)
+ :`'all'`: All geoms are returned
+
+ duplicate_collision_geoms (bool): If set, will guarantee that each collision geom has a
+ visual geom copy
+ """
+
+ def __init__(
+ self,
+ name,
+ size=None,
+ rgba=None,
+ density=None,
+ friction=None,
+ solref=None,
+ solimp=None,
+ material=None,
+ joints="default",
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ ):
+ # Always call superclass first
+ super().__init__(obj_type=obj_type, duplicate_collision_geoms=duplicate_collision_geoms)
+
+ # Set name
+ self._name = name
+
+ if size is None:
+ size = [0.05, 0.05, 0.05]
+ self.size = list(size)
+
+ if rgba is None:
+ rgba = [1, 0, 0, 1]
+ assert len(rgba) == 4, "rgba must be a length 4 array"
+ self.rgba = list(rgba)
+
+ if density is None:
+ density = 1000 # water
+ self.density = density
+
+ if friction is None:
+ friction = [1, 0.005, 0.0001] # MuJoCo default
+ elif isinstance(friction, float) or isinstance(friction, int):
+ friction = [friction, 0.005, 0.0001]
+ assert len(friction) == 3, "friction must be a length 3 array or a single number"
+ self.friction = list(friction)
+
+ if solref is None:
+ self.solref = [0.02, 1.0] # MuJoCo default
+ else:
+ self.solref = solref
+
+ if solimp is None:
+ self.solimp = [0.9, 0.95, 0.001] # MuJoCo default
+ else:
+ self.solimp = solimp
+
+ self.material = material
+ if material == "default":
+ # add in default texture and material for this object (for domain randomization)
+ default_tex = CustomMaterial(
+ texture=self.rgba,
+ tex_name="tex",
+ mat_name="mat",
+ )
+ self.append_material(default_tex)
+ elif material is not None:
+ # add in custom texture and material
+ self.append_material(material)
+
+ # joints for this object
+ if joints == "default":
+ self.joint_specs = [self.get_joint_attrib_template()] # default free joint
+ elif joints is None:
+ self.joint_specs = []
+ else:
+ self.joint_specs = joints
+
+ # Make sure all joints have names!
+ for i, joint_spec in enumerate(self.joint_specs):
+ if "name" not in joint_spec:
+ joint_spec["name"] = "joint{}".format(i)
+
+ # Always run sanity check
+ self.sanity_check()
+
+ # Lastly, parse XML tree appropriately
+ self._obj = self._get_object_subtree()
+
+ # Extract the appropriate private attributes for this
+ self._get_object_properties()
+
+ def _get_object_subtree_(self, ob_type="box"):
+ # Create element tree
+ obj = new_body(name="main")
+
+ # Get base element attributes
+ element_attr = {"name": "g0", "type": ob_type, "size": array_to_string(self.size)}
+
+ # Add collision geom if necessary
+ if self.obj_type in {"collision", "all"}:
+ col_element_attr = deepcopy(element_attr)
+ col_element_attr.update(self.get_collision_attrib_template())
+ col_element_attr["density"] = str(self.density)
+ col_element_attr["friction"] = array_to_string(self.friction)
+ col_element_attr["solref"] = array_to_string(self.solref)
+ col_element_attr["solimp"] = array_to_string(self.solimp)
+ obj.append(new_geom(**col_element_attr))
+ # Add visual geom if necessary
+ if self.obj_type in {"visual", "all"}:
+ vis_element_attr = deepcopy(element_attr)
+ vis_element_attr.update(self.get_visual_attrib_template())
+ vis_element_attr["name"] += "_vis"
+ if self.material == "default":
+ vis_element_attr["rgba"] = "0.5 0.5 0.5 1" # mujoco default
+ vis_element_attr["material"] = "mat"
+ elif self.material is not None:
+ vis_element_attr["material"] = self.material.mat_attrib["name"]
+ else:
+ vis_element_attr["rgba"] = array_to_string(self.rgba)
+ obj.append(new_geom(**vis_element_attr))
+ # add joint(s)
+ for joint_spec in self.joint_specs:
+ obj.append(new_joint(**joint_spec))
+ # add a site as well
+ site_element_attr = self.get_site_attrib_template()
+ site_element_attr["name"] = "default_site"
+ obj.append(new_site(**site_element_attr))
+ return obj
+
+ # Methods that still need to be defined by subclass
+ def _get_object_subtree(self):
+ raise NotImplementedError
+
+ def bottom_offset(self):
+ raise NotImplementedError
+
+ def top_offset(self):
+ raise NotImplementedError
+
+ def horizontal_radius(self):
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/group/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/group/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f000410cd83b63dd4c255d31c6722cfcd319de
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/group/__init__.py
@@ -0,0 +1 @@
+from .transport import TransportGroup
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/group/transport.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/group/transport.py
new file mode 100644
index 0000000000000000000000000000000000000000..58c3bd894dc6d67f41f5e0246f122d97614fa1d8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/group/transport.py
@@ -0,0 +1,174 @@
+import numpy as np
+
+import robosuite.utils.sim_utils as SU
+import robosuite.utils.transform_utils as T
+from robosuite.models.objects import Bin, Lid, ObjectGroup
+
+
+class TransportGroup(ObjectGroup):
+ """
+    Group of objects that captures transporting a payload placed in a start bin to a target bin, while
+    also requiring a piece of trash to be removed from the target bin.
+
+    Args:
+        name (str): Name that will be prepended to all geom bodies generated for this group
+ payload (MujocoObject): Object that represents payload
+ trash (MujocoObject): Object that represents trash
+ bin_size (3-tuple): (x,y,z) full size of bins to place on tables
+ """
+
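+    # Usage sketch (illustrative; not part of the original robosuite source). The payload and
+    # trash objects below are hypothetical stand-ins:
+    #
+    #   payload = BoxObject(name="payload", size=[0.02, 0.02, 0.02])
+    #   trash = BallObject(name="trash", size=[0.02])
+    #   transport = TransportGroup(name="transport", payload=payload, trash=trash,
+    #                              bin_size=(0.3, 0.3, 0.15))
+    #   # Once the sim is built, wire up references and query task state:
+    #   transport.update_sim(sim)
+    #   states = transport.get_states()
+    #   success = states["payload_in_target_bin"] and states["trash_in_trash_bin"]
+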
+ def __init__(self, name, payload, trash, bin_size=(0.3, 0.3, 0.15)):
+ # Store and initialize internal variables
+ self.payload = payload
+ self.trash = trash
+ self.bin_size = bin_size
+
+ # Create bins and lid
+ self.start_bin = Bin(name=f"{name}_start_bin", bin_size=bin_size, density=10000.0)
+ self.target_bin = Bin(name=f"{name}_target_bin", bin_size=bin_size, density=10000.0)
+ self.trash_bin = Bin(name=f"{name}_trash_bin", bin_size=bin_size, density=10000.0)
+ self.lid = Lid(name=f"{name}_start_bin_lid", lid_size=(*bin_size[:2], 0.01))
+
+ # Relevant geom ids
+ self.payload_geom_ids = None
+ self.trash_geom_ids = None
+ self.target_bin_base_geom_ids = None
+ self.trash_bin_base_geom_ids = None
+ self.lid_handle_geom_ids = None
+ self.payload_body_id = None
+ self.trash_body_id = None
+
+ # Run super init
+ super().__init__(name=name)
+
+ def get_states(self):
+ """
+ Grabs all relevant information for this transport group. Returned dictionary maps keywords to corresponding
+ values pulled from the current sim state.
+ Returns:
+ dict:
+ "lid_handle_pose": list of (pos, quat) of lid handle
+ "payload_pose": list of (pos, quat) of hammer handle
+ "trash_pose": list of (pos, quat) of trash object
+ "target_bin_pos": position of target bin (base geom)
+ "trash_bin_pos": position of trash bin (base geom)
+ "trash_in_trash_bin": True if trash object is touching the base of the trash bin
+ "payload_in_target_bin": True if payload object is touching the base of the target bin
+ """
+ return {
+ "lid_handle_pose": (self.lid_handle_pos, self.lid_handle_quat),
+ "payload_pose": (self.payload_pos, self.payload_quat),
+ "trash_pose": (self.trash_pos, self.trash_quat),
+ "target_bin_pos": self.target_bin_pos,
+ "trash_bin_pos": self.trash_bin_pos,
+ "trash_in_trash_bin": self.trash_in_trash_bin,
+ "payload_in_target_bin": self.payload_in_target_bin,
+ }
+
+ def _generate_objects(self):
+ # Store all relevant objects in self._objects
+ self._objects = {
+ "payload": self.payload,
+ "trash": self.trash,
+ "start_bin": self.start_bin,
+ "target_bin": self.target_bin,
+ "trash_bin": self.trash_bin,
+ "lid": self.lid,
+ }
+
+ def update_sim(self, sim):
+ """
+ Updates internal reference to sim and all other references
+ Args:
+ sim (MjSim): Active mujoco sim reference
+ """
+ # Always run super first
+ super().update_sim(sim=sim)
+
+ # Update internal references to IDs
+ self.payload_geom_ids = [self.sim.model.geom_name2id(geom) for geom in self.payload.contact_geoms]
+ self.trash_geom_ids = [self.sim.model.geom_name2id(geom) for geom in self.trash.contact_geoms]
+ self.target_bin_base_geom_ids = [self.sim.model.geom_name2id(geom) for geom in self.target_bin.base_geoms]
+ self.trash_bin_base_geom_ids = [self.sim.model.geom_name2id(geom) for geom in self.trash_bin.base_geoms]
+ self.lid_handle_geom_ids = [self.sim.model.geom_name2id(geom) for geom in self.lid.handle_geoms]
+ self.payload_body_id = self.sim.model.body_name2id(self.payload.root_body)
+ self.trash_body_id = self.sim.model.body_name2id(self.trash.root_body)
+
+ @property
+ def lid_handle_pos(self):
+ """
+ Returns:
+ np.array: (x,y,z) absolute position of the lid handle
+ """
+ return np.array(self.sim.data.geom_xpos[self.lid_handle_geom_ids[0]])
+
+ @property
+ def lid_handle_quat(self):
+ """
+ Returns:
+ np.array: (x,y,z,w) quaternion of the lid handle
+ """
+ return np.array(T.mat2quat(self.sim.data.geom_xmat[self.lid_handle_geom_ids[0]].reshape(3, 3)))
+
+ @property
+ def payload_pos(self):
+ """
+ Returns:
+ np.array: (x,y,z) absolute position of the payload
+ """
+ return np.array(self.sim.data.body_xpos[self.payload_body_id])
+
+ @property
+ def payload_quat(self):
+ """
+ Returns:
+ np.array: (x,y,z,w) quaternion of the payload
+ """
+ return np.array(T.mat2quat(self.sim.data.body_xmat[self.payload_body_id].reshape(3, 3)))
+
+ @property
+ def trash_pos(self):
+ """
+ Returns:
+ np.array: (x,y,z) absolute position of the trash
+ """
+ return np.array(self.sim.data.body_xpos[self.trash_body_id])
+
+ @property
+ def trash_quat(self):
+ """
+ Returns:
+ np.array: (x,y,z,w) quaternion of the trash
+ """
+ return np.array(T.mat2quat(self.sim.data.body_xmat[self.trash_body_id].reshape(3, 3)))
+
+ @property
+ def target_bin_pos(self):
+ """
+ Returns:
+ np.array: (x,y,z) absolute position of the target bin
+ """
+ return np.array(self.sim.data.geom_xpos[self.target_bin_base_geom_ids[0]])
+
+ @property
+ def trash_bin_pos(self):
+ """
+ Returns:
+ np.array: (x,y,z) absolute position of the trash bin
+ """
+ return np.array(self.sim.data.geom_xpos[self.trash_bin_base_geom_ids[0]])
+
+ @property
+ def trash_in_trash_bin(self):
+ """
+ Returns:
+ bool: True if trash is in trash bin
+ """
+ return SU.check_contact(self.sim, self.trash_bin.base_geoms, self.trash.contact_geoms)
+
+ @property
+ def payload_in_target_bin(self):
+ """
+ Returns:
+ bool: True if payload is in target bin
+ """
+ return SU.check_contact(self.sim, self.target_bin.base_geoms, self.payload.contact_geoms)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/object_groups.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/object_groups.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb1100920fbb4a49db4c30bef139870f215dac30
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/object_groups.py
@@ -0,0 +1,46 @@
+class ObjectGroup:
+ """
+    An abstraction that encompasses a group of objects that interact together in a meaningful way.
+
+    Args:
+        name (str): Name of this object group. This will be prepended to all objects generated by this group.
+ """
+
+ def __init__(self, name):
+ # Store internal variables
+ self.name = name
+ self.sim = None # Reference to shared mjsim object
+ self._objects = {} # maps object names to object class instances
+
+ # Generate objects
+ self._generate_objects()
+
+ def get_states(self):
+ """
+ Function to grab group-relevant states. This should be implemented by the subclass.
+ Returns:
+ dict: Keyword-mapped states for this group
+ """
+ raise NotImplementedError
+
+ def update_sim(self, sim):
+ """
+ Updates internal reference to sim and all other relevant references
+ Args:
+ sim (MjSim): Active mujoco sim reference
+ """
+ self.sim = sim
+
+ def _generate_objects(self):
+ """
+ Internal helper function that generates the objects for this group. Should populate self._objects mapping
+ names of objects to their actual object class instances.
+ """
+ raise NotImplementedError
+
+ @property
+ def objects(self):
+ """
+ Contains references to all objects owned by this group. Mapped from names to object instances
+ Returns:
+ dict: keyword-mapped object class instances
+ """
+ return self._objects
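+
+
+# Minimal subclass sketch (illustrative; not part of the original robosuite source). A group
+# that owns a single hypothetical payload object and exposes no extra state:
+#
+#   class SingleObjectGroup(ObjectGroup):
+#       def _generate_objects(self):
+#           self._objects = {"payload": BoxObject(name=f"{self.name}_payload", size=[0.02] * 3)}
+#
+#       def get_states(self):
+#           return {}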
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/objects.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b56dfd9cf9088efad337722ac0ea64b78fb2fec
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/objects.py
@@ -0,0 +1,587 @@
+import copy
+import xml.etree.ElementTree as ET
+from copy import deepcopy
+
+import numpy as np
+
+import robosuite.macros as macros
+from robosuite.models.base import MujocoModel, MujocoXML
+from robosuite.utils.mjcf_utils import (
+ OBJECT_COLLISION_COLOR,
+ CustomMaterial,
+ add_material,
+ add_prefix,
+ array_to_string,
+ find_elements,
+ new_joint,
+ sort_elements,
+ string_to_array,
+)
+
+# Dict mapping geom type string keywords to group number
+GEOMTYPE2GROUP = {
+ "collision": {0}, # If we want to use a geom for physics, but NOT visualize
+ "visual": {1}, # If we want to use a geom for visualization, but NOT physics
+ "all": {0, 1}, # If we want to use a geom for BOTH physics + visualization
+}
+
+GEOM_GROUPS = GEOMTYPE2GROUP.keys()
+
+
+class MujocoObject(MujocoModel):
+ """
+ Base class for all objects.
+
+ We use Mujoco Objects to implement all objects that:
+
+    1) may appear multiple times in a task
+    2) can be swapped between different tasks
+
+    Typical methods return a copy so the caller can modify joints / attributes as needed
+
+ Args:
+ obj_type (str): Geom elements to generate / extract for this object. Must be one of:
+
+ :`'collision'`: Only collision geoms are returned (this corresponds to group 0 geoms)
+ :`'visual'`: Only visual geoms are returned (this corresponds to group 1 geoms)
+ :`'all'`: All geoms are returned
+
+ duplicate_collision_geoms (bool): If set, will guarantee that each collision geom has a
+ visual geom copy
+
+ """
+
+ def __init__(self, obj_type="all", duplicate_collision_geoms=True):
+ super().__init__()
+ self.asset = ET.Element("asset")
+ assert obj_type in GEOM_GROUPS, "object type must be one in {}, got: {} instead.".format(GEOM_GROUPS, obj_type)
+ self.obj_type = obj_type
+ self.duplicate_collision_geoms = duplicate_collision_geoms
+
+ # Attributes that should be filled in within the subclass
+ self._name = None
+ self._obj = None
+
+ # Attributes that are auto-filled by _get_object_properties call
+ self._root_body = None
+ self._bodies = None
+ self._joints = None
+ self._actuators = None
+ self._sites = None
+ self._contact_geoms = None
+ self._visual_geoms = None
+
+ def merge_assets(self, other):
+ """
+ Merges @other's assets in a custom logic.
+
+ Args:
+ other (MujocoXML or MujocoObject): other xml file whose assets will be merged into this one
+ """
+ for asset in other.asset:
+ if (
+ find_elements(root=self.asset, tags=asset.tag, attribs={"name": asset.get("name")}, return_first=True)
+ is None
+ ):
+ self.asset.append(asset)
+
+ def get_obj(self):
+ """
+ Returns the generated / extracted object, in XML ElementTree form.
+
+ Returns:
+ ET.Element: Object in XML form.
+ """
+ assert self._obj is not None, "Object XML tree has not been generated yet!"
+ return self._obj
+
+ def exclude_from_prefixing(self, inp):
+ """
+ A function that should take in either an ET.Element or its attribute (str) and return either True or False,
+ determining whether the corresponding name / str to @inp should have naming_prefix added to it.
+ Must be defined by subclass.
+
+ Args:
+ inp (ET.Element or str): Element or its attribute to check for prefixing.
+
+ Returns:
+ bool: True if we should exclude the associated name(s) with @inp from being prefixed with naming_prefix
+ """
+ raise NotImplementedError
+
+ def _get_object_subtree(self):
+        """
+        Returns an ET.Element.
+        It is a subtree that defines all collision and / or visualization related fields
+ of this object.
+ Return should be a copy.
+ Must be defined by subclass.
+
+ Returns:
+ ET.Element: body
+ """
+ raise NotImplementedError
+
+ def _get_object_properties(self):
+ """
+ Helper function to extract relevant object properties (bodies, joints, contact/visual geoms, etc...) from this
+ object's XML tree. Assumes the self._obj attribute has already been filled.
+ """
+ # Parse element tree to get all relevant bodies, joints, actuators, and geom groups
+ _elements = sort_elements(root=self.get_obj())
+ assert (
+ len(_elements["root_body"]) == 1
+ ), "Invalid number of root bodies found for robot model. Expected 1," "got {}".format(
+ len(_elements["root_body"])
+ )
+ _elements["root_body"] = _elements["root_body"][0]
+ _elements["bodies"] = (
+ [_elements["root_body"]] + _elements["bodies"] if "bodies" in _elements else [_elements["root_body"]]
+ )
+ self._root_body = _elements["root_body"].get("name")
+ self._bodies = [e.get("name") for e in _elements.get("bodies", [])]
+ self._joints = [e.get("name") for e in _elements.get("joints", [])]
+ self._actuators = [e.get("name") for e in _elements.get("actuators", [])]
+ self._sites = [e.get("name") for e in _elements.get("sites", [])]
+ self._sensors = [e.get("name") for e in _elements.get("sensors", [])]
+ self._contact_geoms = [e.get("name") for e in _elements.get("contact_geoms", [])]
+ self._visual_geoms = [e.get("name") for e in _elements.get("visual_geoms", [])]
+
+ # Add default materials if we're using domain randomization
+ if macros.USING_INSTANCE_RANDOMIZATION:
+ tex_element, mat_element, _, used = add_material(root=self.get_obj(), naming_prefix=self.naming_prefix)
+ # Only add the material / texture if they were actually used
+ if used:
+ self.asset.append(tex_element)
+ self.asset.append(mat_element)
+
+ # Add prefix to all elements
+ add_prefix(root=self.get_obj(), prefix=self.naming_prefix, exclude=self.exclude_from_prefixing)
+
+ @property
+ def name(self):
+ return self._name
+
+ @property
+ def naming_prefix(self):
+ return "{}_".format(self.name)
+
+ @property
+ def root_body(self):
+ return self.correct_naming(self._root_body)
+
+ @property
+ def bodies(self):
+ return self.correct_naming(self._bodies)
+
+ @property
+ def joints(self):
+ return self.correct_naming(self._joints)
+
+ @property
+ def actuators(self):
+ return self.correct_naming(self._actuators)
+
+ @property
+ def sites(self):
+ return self.correct_naming(self._sites)
+
+ @property
+ def sensors(self):
+ return self.correct_naming(self._sensors)
+
+ @property
+ def contact_geoms(self):
+ return self.correct_naming(self._contact_geoms)
+
+ @property
+ def visual_geoms(self):
+ return self.correct_naming(self._visual_geoms)
+
+ @property
+ def important_geoms(self):
+ """
+ Returns:
+ dict: (Default is no important geoms; i.e.: empty dict)
+ """
+ return {}
+
+ @property
+ def important_sites(self):
+ """
+ Returns:
+ dict:
+
+ :`obj`: Object default site
+ """
+ return {"obj": self.naming_prefix + "default_site"}
+
+ @property
+ def important_sensors(self):
+ """
+ Returns:
+ dict: (Default is no sensors; i.e.: empty dict)
+ """
+ return {}
+
+ @property
+ def bottom_offset(self):
+ """
+ Returns vector from model root body to model bottom.
+ Useful for, e.g. placing models on a surface.
+ Must be defined by subclass.
+
+ Returns:
+ np.array: (dx, dy, dz) offset vector
+ """
+ raise NotImplementedError
+
+ @property
+ def top_offset(self):
+ """
+ Returns vector from model root body to model top.
+ Useful for, e.g. placing models on a surface.
+ Must be defined by subclass.
+
+ Returns:
+ np.array: (dx, dy, dz) offset vector
+ """
+ raise NotImplementedError
+
+ @property
+ def horizontal_radius(self):
+ """
+ Returns maximum distance from model root body to any radial point of the model.
+
+        Helps us place models programmatically without them flying away due to a huge initial contact force.
+ Must be defined by subclass.
+
+ Returns:
+ float: radius
+ """
+ raise NotImplementedError
+
+ @staticmethod
+ def get_site_attrib_template():
+ """
+ Returns attribs of spherical site used to mark body origin
+
+ Returns:
+ dict: Dictionary of default site attributes
+ """
+ return {
+ "pos": "0 0 0",
+ "size": "0.002 0.002 0.002",
+ "rgba": "1 0 0 1",
+ "type": "sphere",
+ "group": "0",
+ }
+
+ @staticmethod
+ def get_joint_attrib_template():
+ """
+ Returns attribs of free joint
+
+ Returns:
+ dict: Dictionary of default joint attributes
+ """
+ return {
+ "type": "free",
+ }
+
+ def get_bounding_box_half_size(self):
+ raise NotImplementedError
+
+ def get_bounding_box_size(self):
+ """
+ Returns numpy array with dimensions of a bounding box around this object.
+ """
+ return 2. * self.get_bounding_box_half_size()
+
+
+class MujocoXMLObject(MujocoObject, MujocoXML):
+ """
+ MujocoObjects that are loaded from xml files (by default, inherit all properties (e.g.: name)
+ from MujocoObject class first!)
+
+ Args:
+ fname (str): XML File path
+
+ name (str): Name of this MujocoXMLObject
+
+ joints (None or str or list of dict): each dictionary corresponds to a joint that will be created for this
+ object. The dictionary should specify the joint attributes (type, pos, etc.) according to the MuJoCo xml
+ specification. If "default", a single free-joint will be automatically generated. If None, no joints will
+ be created.
+
+ obj_type (str): Geom elements to generate / extract for this object. Must be one of:
+
+ :`'collision'`: Only collision geoms are returned (this corresponds to group 0 geoms)
+ :`'visual'`: Only visual geoms are returned (this corresponds to group 1 geoms)
+ :`'all'`: All geoms are returned
+
+ duplicate_collision_geoms (bool): If set, will guarantee that each collision geom has a
+ visual geom copy
+ """
+
+ def __init__(self, fname, name, joints="default", obj_type="all", duplicate_collision_geoms=True):
+ MujocoXML.__init__(self, fname)
+ # Set obj type and duplicate args
+ assert obj_type in GEOM_GROUPS, "object type must be one in {}, got: {} instead.".format(GEOM_GROUPS, obj_type)
+ self.obj_type = obj_type
+ self.duplicate_collision_geoms = duplicate_collision_geoms
+
+ # Set name
+ self._name = name
+
+ # joints for this object
+ if joints == "default":
+ self.joint_specs = [self.get_joint_attrib_template()] # default free joint
+ elif joints is None:
+ self.joint_specs = []
+ else:
+ self.joint_specs = joints
+
+ # Make sure all joints have names!
+ for i, joint_spec in enumerate(self.joint_specs):
+ if "name" not in joint_spec:
+ joint_spec["name"] = "joint{}".format(i)
+
+ # Lastly, parse XML tree appropriately
+ self._obj = self._get_object_subtree()
+
+ # Extract the appropriate private attributes for this
+ self._get_object_properties()
+
+ def _get_object_subtree(self):
+ # Parse object
+ obj = copy.deepcopy(self.worldbody.find("./body/body[@name='object']"))
+ # Rename this top level object body (will have self.naming_prefix added later)
+ obj.attrib["name"] = "main"
+ # Get all geom_pairs in this tree
+ geom_pairs = self._get_geoms(obj)
+
+ # Define a temp function so we don't duplicate so much code
+ obj_type = self.obj_type
+
+ def _should_keep(el):
+ return int(el.get("group")) in GEOMTYPE2GROUP[obj_type]
+
+ # Loop through each of these pairs and modify them according to @elements arg
+ for i, (parent, element) in enumerate(geom_pairs):
+ # Delete non-relevant geoms and rename remaining ones
+ if not _should_keep(element):
+ parent.remove(element)
+ else:
+ g_name = element.get("name")
+ g_name = g_name if g_name is not None else f"g{i}"
+ element.set("name", g_name)
+ # Also optionally duplicate collision geoms if requested (and this is a collision geom)
+ if self.duplicate_collision_geoms and element.get("group") in {None, "0"}:
+ parent.append(self._duplicate_visual_from_collision(element))
+ # Also manually set the visual appearances to the original collision model
+ element.set("rgba", array_to_string(OBJECT_COLLISION_COLOR))
+ if element.get("material") is not None:
+ del element.attrib["material"]
+ # add joint(s)
+ for joint_spec in self.joint_specs:
+ obj.append(new_joint(**joint_spec))
+ # Lastly, add a site for this object
+ template = self.get_site_attrib_template()
+ template["rgba"] = "1 0 0 0"
+ template["name"] = "default_site"
+ obj.append(ET.Element("site", attrib=template))
+
+ return obj
+
+ def exclude_from_prefixing(self, inp):
+ """
+ By default, don't exclude any from being prefixed
+ """
+ return False
+
+ def _get_object_properties(self):
+ """
+ Extends the base class method to also add prefixes to all bodies in this object
+ """
+ super()._get_object_properties()
+ add_prefix(root=self.root, prefix=self.naming_prefix, exclude=self.exclude_from_prefixing)
+
+ @staticmethod
+ def _duplicate_visual_from_collision(element):
+ """
+ Helper function to duplicate a geom element to be a visual element. Namely, this corresponds to the
+        following attribute requirements: group=1, conaffinity/contype=0, negligible mass, name appended with "_visual"
+
+ Args:
+ element (ET.Element): element to duplicate as a visual geom
+
+ Returns:
+ element (ET.Element): duplicated element
+ """
+ # Copy element
+ vis_element = deepcopy(element)
+ # Modify for visual-specific attributes (group=1, conaffinity/contype=0, no mass, update name)
+ vis_element.set("group", "1")
+ vis_element.set("conaffinity", "0")
+ vis_element.set("contype", "0")
+ vis_element.set("mass", "1e-8")
+ vis_element.set("name", vis_element.get("name") + "_visual")
+ return vis_element
+
+ def _get_geoms(self, root, _parent=None):
+ """
+ Helper function to recursively search through element tree starting at @root and returns
+ a list of (parent, child) tuples where the child is a geom element
+
+ Args:
+ root (ET.Element): Root of xml element tree to start recursively searching through
+ _parent (ET.Element): Parent of the root element tree. Should not be used externally; only set
+ during the recursive call
+
+ Returns:
+ list: array of (parent, child) tuples where the child element is a geom type
+ """
+ # Initialize return array
+ geom_pairs = []
+ # If the parent exists and this is a geom element, we add this current (parent, element) combo to the output
+ if _parent is not None and root.tag == "geom":
+ geom_pairs.append((_parent, root))
+ # Loop through all children elements recursively and add to pairs
+ for child in root:
+ geom_pairs += self._get_geoms(child, _parent=root)
+ # Return all found pairs
+ return geom_pairs
+
+ @property
+ def bottom_offset(self):
+ bottom_site = self.worldbody.find("./body/site[@name='{}bottom_site']".format(self.naming_prefix))
+ return string_to_array(bottom_site.get("pos"))
+
+ @property
+ def top_offset(self):
+ top_site = self.worldbody.find("./body/site[@name='{}top_site']".format(self.naming_prefix))
+ return string_to_array(top_site.get("pos"))
+
+ @property
+ def horizontal_radius(self):
+ horizontal_radius_site = self.worldbody.find(
+ "./body/site[@name='{}horizontal_radius_site']".format(self.naming_prefix)
+ )
+ return string_to_array(horizontal_radius_site.get("pos"))[0]
+
+ def get_bounding_box_half_size(self):
+ horizontal_radius_site = self.worldbody.find(
+ "./body/site[@name='{}horizontal_radius_site']".format(self.naming_prefix)
+ )
+ return string_to_array(horizontal_radius_site.get("pos")) - self.bottom_offset
+
+
+class MujocoGeneratedObject(MujocoObject):
+ """
+ Base class for all procedurally generated objects.
+
+ Args:
+ obj_type (str): Geom elements to generate / extract for this object. Must be one of:
+
+ :`'collision'`: Only collision geoms are returned (this corresponds to group 0 geoms)
+ :`'visual'`: Only visual geoms are returned (this corresponds to group 1 geoms)
+ :`'all'`: All geoms are returned
+
+ duplicate_collision_geoms (bool): If set, will guarantee that each collision geom has a
+ visual geom copy
+ """
+
+ def __init__(self, obj_type="all", duplicate_collision_geoms=True):
+ super().__init__(obj_type=obj_type, duplicate_collision_geoms=duplicate_collision_geoms)
+
+ # Store common material names so we don't add prefixes to them
+ self.shared_materials = set()
+ self.shared_textures = set()
+
+ def sanity_check(self):
+ """
+ Checks if data provided makes sense.
+ Called in __init__()
+ For subclasses to inherit from
+ """
+ pass
+
+ @staticmethod
+ def get_collision_attrib_template():
+ """
+ Generates template with collision attributes for a given geom
+
+ Returns:
+ dict: Initial template with `'pos'` and `'group'` already specified
+ """
+ return {"group": "0", "rgba": array_to_string(OBJECT_COLLISION_COLOR)}
+
+ @staticmethod
+ def get_visual_attrib_template():
+ """
+ Generates template with visual attributes for a given geom
+
+ Returns:
+ dict: Initial template with `'conaffinity'`, `'contype'`, and `'group'` already specified
+ """
+ return {"conaffinity": "0", "contype": "0", "mass": "1e-8", "group": "1"}
+
+ def append_material(self, material):
+ """
+ Adds a new texture / material combination to the assets subtree of this XML
+ Input is expected to be a CustomMaterial object
+
+ See http://www.mujoco.org/book/XMLreference.html#asset for specific details on attributes expected for
+ Mujoco texture / material tags, respectively
+
+ Note that the "file" attribute for the "texture" tag should be specified relative to the textures directory
+ located in robosuite/models/assets/textures/
+
+ Args:
+ material (CustomMaterial): Material to add to this object
+ """
+ # First check if asset attribute exists; if not, define the asset attribute
+ if not hasattr(self, "asset"):
+ self.asset = ET.Element("asset")
+ # If the material name is not in shared materials, add this to our assets
+ if material.name not in self.shared_materials:
+ self.asset.append(ET.Element("texture", attrib=material.tex_attrib))
+ self.asset.append(ET.Element("material", attrib=material.mat_attrib))
+ # Add this material name to shared materials if it should be shared
+ if material.shared:
+ self.shared_materials.add(material.name)
+ self.shared_textures.add(material.tex_attrib["name"])
+ # Update prefix for assets
+ add_prefix(root=self.asset, prefix=self.naming_prefix, exclude=self.exclude_from_prefixing)
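+
+    # Usage sketch (illustrative; the attribute-dict keyword names are assumed from the robosuite
+    # CustomMaterial API, and "WoodRed" is a texture assumed to ship with robosuite):
+    #
+    #   redwood = CustomMaterial(
+    #       texture="WoodRed",
+    #       tex_name="redwood",
+    #       mat_name="redwood_mat",
+    #       tex_attrib={"type": "cube"},
+    #       mat_attrib={"texrepeat": "1 1", "specular": "0.4", "shininess": "0.1"},
+    #   )
+    #   cube = BoxObject(name="cube", size=[0.02] * 3, material=redwood)  # forwards to append_material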
+
+ def exclude_from_prefixing(self, inp):
+ """
+ Exclude all shared materials and their associated names from being prefixed.
+
+ Args:
+ inp (ET.Element or str): Element or its attribute to check for prefixing.
+
+ Returns:
+ bool: True if we should exclude the associated name(s) with @inp from being prefixed with naming_prefix
+ """
+ # Automatically return False if this is not of type "str"
+ if type(inp) is not str:
+ return False
+ # Only return True if the string matches the name of a common material
+        return inp in self.shared_materials or inp in self.shared_textures
+
+ # Methods that still need to be defined by subclass
+ def _get_object_subtree(self):
+ raise NotImplementedError
+
+ def bottom_offset(self):
+ raise NotImplementedError
+
+ def top_offset(self):
+ raise NotImplementedError
+
+ def horizontal_radius(self):
+ raise NotImplementedError
+
+ def get_bounding_box_half_size(self):
+ return np.array([self.horizontal_radius, self.horizontal_radius, 0.]) - self.bottom_offset
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b224ac36f77f0c71cc7b6e38fd9b697f75f2823
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/__init__.py
@@ -0,0 +1,4 @@
+from .ball import BallObject
+from .box import BoxObject
+from .capsule import CapsuleObject
+from .cylinder import CylinderObject
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/ball.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/ball.py
new file mode 100644
index 0000000000000000000000000000000000000000..95c6621dedadaf436ecb33dc3720fc8db7ddbacf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/ball.py
@@ -0,0 +1,71 @@
+import numpy as np
+
+from robosuite.models.objects import PrimitiveObject
+from robosuite.utils.mjcf_utils import get_size
+
+
+class BallObject(PrimitiveObject):
+ """
+ A ball (sphere) object.
+
+ Args:
+ size (1-tuple of float): (radius) size parameters for this ball object
+ """
+
+ def __init__(
+ self,
+ name,
+ size=None,
+ size_max=None,
+ size_min=None,
+ density=None,
+ friction=None,
+ rgba=None,
+ solref=None,
+ solimp=None,
+ material=None,
+ joints="default",
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ ):
+ size = get_size(size, size_max, size_min, [0.07], [0.03])
+ super().__init__(
+ name=name,
+ size=size,
+ rgba=rgba,
+ density=density,
+ friction=friction,
+ solref=solref,
+ solimp=solimp,
+ material=material,
+ joints=joints,
+ obj_type=obj_type,
+ duplicate_collision_geoms=duplicate_collision_geoms,
+ )
+
+ def sanity_check(self):
+ """
+ Checks to make sure inputted size is of correct length
+
+ Raises:
+ AssertionError: [Invalid size length]
+ """
+ assert len(self.size) == 1, "ball size should have length 1"
+
+ def _get_object_subtree(self):
+ return self._get_object_subtree_(ob_type="sphere")
+
+ @property
+ def bottom_offset(self):
+ return np.array([0, 0, -1 * self.size[0]])
+
+ @property
+ def top_offset(self):
+ return np.array([0, 0, self.size[0]])
+
+ @property
+ def horizontal_radius(self):
+ return self.size[0]
+
+ def get_bounding_box_half_size(self):
+ return np.array([self.size[0], self.size[0], self.size[0]])
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/box.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/box.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fcb660be972e8862887be7508d16d7bbe6307c8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/box.py
@@ -0,0 +1,71 @@
+import numpy as np
+
+from robosuite.models.objects import PrimitiveObject
+from robosuite.utils.mjcf_utils import get_size
+
+
+class BoxObject(PrimitiveObject):
+ """
+ A box object.
+
+ Args:
+ size (3-tuple of float): (half-x, half-y, half-z) size parameters for this box object
+ """
+
+ def __init__(
+ self,
+ name,
+ size=None,
+ size_max=None,
+ size_min=None,
+ density=None,
+ friction=None,
+ rgba=None,
+ solref=None,
+ solimp=None,
+ material=None,
+ joints="default",
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ ):
+ size = get_size(size, size_max, size_min, [0.07, 0.07, 0.07], [0.03, 0.03, 0.03])
+ super().__init__(
+ name=name,
+ size=size,
+ rgba=rgba,
+ density=density,
+ friction=friction,
+ solref=solref,
+ solimp=solimp,
+ material=material,
+ joints=joints,
+ obj_type=obj_type,
+ duplicate_collision_geoms=duplicate_collision_geoms,
+ )
+
+ def sanity_check(self):
+ """
+ Checks to make sure inputted size is of correct length
+
+ Raises:
+ AssertionError: [Invalid size length]
+ """
+ assert len(self.size) == 3, "box size should have length 3"
+
+ def _get_object_subtree(self):
+ return self._get_object_subtree_(ob_type="box")
+
+ @property
+ def bottom_offset(self):
+ return np.array([0, 0, -1 * self.size[2]])
+
+ @property
+ def top_offset(self):
+ return np.array([0, 0, self.size[2]])
+
+ @property
+ def horizontal_radius(self):
+ return np.linalg.norm(self.size[0:2], 2)
+
+ def get_bounding_box_half_size(self):
+ return np.array([self.size[0], self.size[1], self.size[2]])
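+
+
+# Usage sketch (illustrative; not part of the original robosuite source):
+#
+#   cube = BoxObject(name="cube", size=[0.02, 0.02, 0.02], rgba=[1, 0, 0, 1])
+#   cube.get_obj()                      # XML subtree (ET.Element) for the object body
+#   cube.bottom_offset                  # -> array([0., 0., -0.02]), used for table placement
+#   cube.get_bounding_box_half_size()   # -> array([0.02, 0.02, 0.02])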
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/capsule.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/capsule.py
new file mode 100644
index 0000000000000000000000000000000000000000..6139cb3473d1d658a169f5620775722e75581929
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/capsule.py
@@ -0,0 +1,71 @@
+import numpy as np
+
+from robosuite.models.objects import PrimitiveObject
+from robosuite.utils.mjcf_utils import get_size
+
+
+class CapsuleObject(PrimitiveObject):
+ """
+ A capsule object.
+
+ Args:
+ size (2-tuple of float): (radius, half-length) size parameters for this capsule object
+ """
+
+ def __init__(
+ self,
+ name,
+ size=None,
+ size_max=None,
+ size_min=None,
+ density=None,
+ friction=None,
+ rgba=None,
+ solref=None,
+ solimp=None,
+ material=None,
+ joints="default",
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ ):
+ size = get_size(size, size_max, size_min, [0.07, 0.07], [0.03, 0.03])
+ super().__init__(
+ name=name,
+ size=size,
+ rgba=rgba,
+ density=density,
+ friction=friction,
+ solref=solref,
+ solimp=solimp,
+ material=material,
+ joints=joints,
+ obj_type=obj_type,
+ duplicate_collision_geoms=duplicate_collision_geoms,
+ )
+
+ def sanity_check(self):
+ """
+ Checks to make sure inputted size is of correct length
+
+ Raises:
+ AssertionError: [Invalid size length]
+ """
+ assert len(self.size) == 2, "capsule size should have length 2"
+
+ def _get_object_subtree(self):
+ return self._get_object_subtree_(ob_type="capsule")
+
+ @property
+ def bottom_offset(self):
+ return np.array([0, 0, -1 * (self.size[0] + self.size[1])])
+
+ @property
+ def top_offset(self):
+ return np.array([0, 0, (self.size[0] + self.size[1])])
+
+ @property
+ def horizontal_radius(self):
+ return self.size[0]
+
+ def get_bounding_box_half_size(self):
+ return np.array([self.size[0], self.size[0], self.size[0] + self.size[1]])
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/cylinder.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/cylinder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e2dc9af1d6ab3a4c10226b141f6a181437cd44a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/primitive/cylinder.py
@@ -0,0 +1,95 @@
+import numpy as np
+
+from robosuite.models.objects import MujocoGeneratedObject, PrimitiveObject
+from robosuite.utils.mjcf_utils import get_size
+
+
+class CylinderObject(PrimitiveObject):
+ """
+ A cylinder object.
+
+ Args:
+ size (2-tuple of float): (radius, half-length) size parameters for this cylinder object
+ """
+
+ def __init__(
+ self,
+ name,
+ size=None,
+ size_max=None,
+ size_min=None,
+ density=None,
+ friction=None,
+ rgba=None,
+ solref=None,
+ solimp=None,
+ material=None,
+ joints="default",
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ ):
+ size = get_size(size, size_max, size_min, [0.07, 0.07], [0.03, 0.03])
+
+ # We override solref, solimp, and joint default values for better stability
+ if friction is None:
+ friction = [1, 0.01, 0.001]
+ if solref is None:
+ solref = [0.01, 0.5]
+ if joints == "default":
+ joints = [{"type": "free", "damping": "0.0001"}]
+
+ super().__init__(
+ name=name,
+ size=size,
+ rgba=rgba,
+ density=density,
+ friction=friction,
+ solref=solref,
+ solimp=solimp,
+ material=material,
+ joints=joints,
+ obj_type=obj_type,
+ duplicate_collision_geoms=duplicate_collision_geoms,
+ )
+
+ def sanity_check(self):
+ """
+ Checks to make sure inputted size is of correct length
+
+ Raises:
+ AssertionError: [Invalid size length]
+ """
+ assert len(self.size) == 2, "cylinder size should have length 2"
+
+ def _get_object_subtree(self):
+ return self._get_object_subtree_(ob_type="cylinder")
+
+ @staticmethod
+ def get_collision_attrib_template():
+ """
+ Generates template with collision attributes for a given geom
+
+ Extends super method for better stability for contacts
+
+ Returns:
+ dict: Initial template with `'pos'` and `'group'` already specified
+ """
+ template = MujocoGeneratedObject.get_collision_attrib_template()
+        # Add a small collision margin for more stable contacts
+ template["margin"] = "0.001"
+ return template
+
+ @property
+ def bottom_offset(self):
+ return np.array([0, 0, -1 * self.size[1]])
+
+ @property
+ def top_offset(self):
+ return np.array([0, 0, self.size[1]])
+
+ @property
+ def horizontal_radius(self):
+ return self.size[0]
+
+ def get_bounding_box_half_size(self):
+ return np.array([self.size[0], self.size[0], self.size[1]])
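
Note: the three primitive objects above (box, capsule, cylinder) share the same placement interface — `bottom_offset`, `top_offset`, `horizontal_radius`, and `get_bounding_box_half_size` — but compute it from different size conventions. The following is a minimal usage sketch, illustrative only and not part of the diff; it assumes the robosuite package added here is importable and exports these classes from `robosuite.models.objects`.

```python
# Illustrative sketch: compare placement offsets of the primitive objects defined above.
# Assumes the robosuite package from this diff is importable.
import numpy as np
from robosuite.models.objects import BoxObject, CapsuleObject, CylinderObject

box = BoxObject(name="box", size=[0.02, 0.03, 0.04])       # half-sizes along x, y, z
capsule = CapsuleObject(name="cap", size=[0.02, 0.05])     # (radius, half-length)
cylinder = CylinderObject(name="cyl", size=[0.02, 0.05])   # (radius, half-length)

for obj in (box, capsule, cylinder):
    # bottom_offset points from the object frame to its lowest point, so placing the
    # object at z = -bottom_offset[2] rests it on the z = 0 plane.
    print(type(obj).__name__, obj.bottom_offset, obj.top_offset, obj.horizontal_radius)
```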
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/objects/xml_objects.py b/phantom/submodules/phantom-robosuite/robosuite/models/objects/xml_objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..68e0369d469d33e8052e6e0157182bb65e0b194d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/objects/xml_objects.py
@@ -0,0 +1,299 @@
+import numpy as np
+
+from robosuite.models.objects import MujocoXMLObject
+from robosuite.utils.mjcf_utils import array_to_string, find_elements, xml_path_completion
+
+
+class BottleObject(MujocoXMLObject):
+ """
+ Bottle object
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/bottle.xml"),
+ name=name,
+ joints=[dict(type="free", damping="0.0005")],
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ )
+
+
+class CanObject(MujocoXMLObject):
+ """
+ Coke can object (used in PickPlace)
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/can.xml"),
+ name=name,
+ joints=[dict(type="free", damping="0.0005")],
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ )
+
+
+class LemonObject(MujocoXMLObject):
+ """
+ Lemon object
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/lemon.xml"), name=name, obj_type="all", duplicate_collision_geoms=True
+ )
+
+
+class MilkObject(MujocoXMLObject):
+ """
+ Milk carton object (used in PickPlace)
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/milk.xml"),
+ name=name,
+ joints=[dict(type="free", damping="0.0005")],
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ )
+
+
+class BreadObject(MujocoXMLObject):
+ """
+ Bread loaf object (used in PickPlace)
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/bread.xml"),
+ name=name,
+ joints=[dict(type="free", damping="0.0005")],
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ )
+
+
+class CerealObject(MujocoXMLObject):
+ """
+ Cereal box object (used in PickPlace)
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/cereal.xml"),
+ name=name,
+ joints=[dict(type="free", damping="0.0005")],
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ )
+
+
+class SquareNutObject(MujocoXMLObject):
+ """
+ Square nut object (used in NutAssembly)
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/square-nut.xml"),
+ name=name,
+ joints=[dict(type="free", damping="0.0005")],
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ )
+
+ @property
+ def important_sites(self):
+ """
+ Returns:
+ dict: In addition to any default sites for this object, also provides the following entries
+
+ :`'handle'`: Name of nut handle location site
+ """
+ # Get dict from super call and add to it
+ dic = super().important_sites
+ dic.update({"handle": self.naming_prefix + "handle_site"})
+ return dic
+
+
+class RoundNutObject(MujocoXMLObject):
+ """
+ Round nut (used in NutAssembly)
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/round-nut.xml"),
+ name=name,
+ joints=[dict(type="free", damping="0.0005")],
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ )
+
+ @property
+ def important_sites(self):
+ """
+ Returns:
+ dict: In addition to any default sites for this object, also provides the following entries
+
+ :`'handle'`: Name of nut handle location site
+ """
+ # Get dict from super call and add to it
+ dic = super().important_sites
+ dic.update({"handle": self.naming_prefix + "handle_site"})
+ return dic
+
+
+class MilkVisualObject(MujocoXMLObject):
+ """
+ Visual fiducial of milk carton (used in PickPlace).
+
+ Fiducial objects are not involved in collision physics.
+ They provide a point of reference to indicate a position.
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/milk-visual.xml"),
+ name=name,
+ joints=None,
+ obj_type="visual",
+ duplicate_collision_geoms=True,
+ )
+
+
+class BreadVisualObject(MujocoXMLObject):
+ """
+ Visual fiducial of bread loaf (used in PickPlace)
+
+ Fiducial objects are not involved in collision physics.
+ They provide a point of reference to indicate a position.
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/bread-visual.xml"),
+ name=name,
+ joints=None,
+ obj_type="visual",
+ duplicate_collision_geoms=True,
+ )
+
+
+class CerealVisualObject(MujocoXMLObject):
+ """
+ Visual fiducial of cereal box (used in PickPlace)
+
+ Fiducial objects are not involved in collision physics.
+ They provide a point of reference to indicate a position.
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/cereal-visual.xml"),
+ name=name,
+ joints=None,
+ obj_type="visual",
+ duplicate_collision_geoms=True,
+ )
+
+
+class CanVisualObject(MujocoXMLObject):
+ """
+ Visual fiducial of coke can (used in PickPlace)
+
+ Fiducial objects are not involved in collision physics.
+ They provide a point of reference to indicate a position.
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/can-visual.xml"),
+ name=name,
+ joints=None,
+ obj_type="visual",
+ duplicate_collision_geoms=True,
+ )
+
+
+class PlateWithHoleObject(MujocoXMLObject):
+ """
+ Square plate with a hole in the center (used in PegInHole)
+ """
+
+ def __init__(self, name):
+ super().__init__(
+ xml_path_completion("objects/plate-with-hole.xml"),
+ name=name,
+ joints=None,
+ obj_type="all",
+ duplicate_collision_geoms=True,
+ )
+
+
+class DoorObject(MujocoXMLObject):
+ """
+ Door with handle (used in Door)
+
+ Args:
+ friction (3-tuple of float): friction parameters to override the ones specified in the XML
+ damping (float): damping parameter to override the ones specified in the XML
+ lock (bool): Whether to use the locked door variation object or not
+ """
+
+ def __init__(self, name, friction=None, damping=None, lock=False):
+ xml_path = "objects/door.xml"
+ if lock:
+ xml_path = "objects/door_lock.xml"
+ super().__init__(
+ xml_path_completion(xml_path), name=name, joints=None, obj_type="all", duplicate_collision_geoms=True
+ )
+
+ # Set relevant body names
+ self.door_body = self.naming_prefix + "door"
+ self.frame_body = self.naming_prefix + "frame"
+ self.latch_body = self.naming_prefix + "latch"
+ self.hinge_joint = self.naming_prefix + "hinge"
+
+ self.lock = lock
+ self.friction = friction
+ self.damping = damping
+ if self.friction is not None:
+ self._set_door_friction(self.friction)
+ if self.damping is not None:
+ self._set_door_damping(self.damping)
+
+ def _set_door_friction(self, friction):
+ """
+ Helper function to override the door friction directly in the XML
+
+ Args:
+ friction (3-tuple of float): friction parameters to override the ones specified in the XML
+ """
+ hinge = find_elements(root=self.worldbody, tags="joint", attribs={"name": self.hinge_joint}, return_first=True)
+ hinge.set("frictionloss", array_to_string(np.array([friction])))
+
+ def _set_door_damping(self, damping):
+ """
+        Helper function to override the door damping directly in the XML
+
+ Args:
+ damping (float): damping parameter to override the ones specified in the XML
+ """
+ hinge = find_elements(root=self.worldbody, tags="joint", attribs={"name": self.hinge_joint}, return_first=True)
+ hinge.set("damping", array_to_string(np.array([damping])))
+
+ @property
+ def important_sites(self):
+ """
+ Returns:
+ dict: In addition to any default sites for this object, also provides the following entries
+
+ :`'handle'`: Name of door handle location site
+ """
+ # Get dict from super call and add to it
+ dic = super().important_sites
+ dic.update({"handle": self.naming_prefix + "handle"})
+ return dic
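
Note: the XML-backed objects above load MJCF asset files via `xml_path_completion` and expose name-prefixed `important_sites`; `DoorObject` additionally rewrites the hinge friction/damping directly in the XML. A minimal usage sketch follows — illustrative, not part of the diff, and it assumes the corresponding `objects/*.xml` assets ship with this robosuite copy.

```python
# Illustrative sketch: instantiate a few of the XML-backed objects above and read
# their prefixed handle sites. Assumes the objects/*.xml assets are present on disk.
from robosuite.models.objects import DoorObject, SquareNutObject

nut = SquareNutObject(name="nut0")
door = DoorObject(name="door0", friction=0.0, damping=0.1, lock=True)

print(nut.important_sites["handle"])    # e.g. "nut0_handle_site" (naming prefix applied)
print(door.important_sites["handle"])   # e.g. "door0_handle"
```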
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79787df25e2de33708dba56ee2ffa30c71af5b00
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/__init__.py
@@ -0,0 +1,2 @@
+from .robot_model import RobotModel, create_robot
+from .manipulators import *
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6d2c177d05d0cf19aa37d13e723af97be9eea15
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/__init__.py
@@ -0,0 +1,8 @@
+from .manipulator_model import ManipulatorModel
+from .sawyer_robot import Sawyer
+from .baxter_robot import Baxter
+from .panda_robot import Panda
+from .jaco_robot import Jaco
+from .kinova3_robot import Kinova3
+from .iiwa_robot import IIWA
+from .ur5e_robot import UR5e
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/baxter_robot.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/baxter_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..631dba7e16da4e5fac42719c9bb81ec5a7f3f254
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/baxter_robot.py
@@ -0,0 +1,89 @@
+import numpy as np
+
+from robosuite.models.robots.manipulators.manipulator_model import ManipulatorModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class Baxter(ManipulatorModel):
+ """
+ Baxter is a hunky bimanual robot designed by Rethink Robotics.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("robots/baxter/robot.xml"), idn=idn)
+
+ @property
+ def default_mount(self):
+ return "RethinkMinimalMount"
+
+ @property
+ def default_gripper(self):
+ """
+        Since this is a bimanual robot, returns dict with `'right'`, `'left'` keywords corresponding to their respective
+ values
+
+ Returns:
+ dict: Dictionary containing arm-specific gripper names
+ """
+ return {"right": "RethinkGripper", "left": "RethinkGripper"}
+
+ @property
+ def default_controller_config(self):
+ """
+        Since this is a bimanual robot, returns dict with `'right'`, `'left'` keywords corresponding to their respective
+ values
+
+ Returns:
+ dict: Dictionary containing arm-specific default controller config names
+ """
+ return {"right": "default_baxter", "left": "default_baxter"}
+
+ @property
+ def init_qpos(self):
+ """
+        Since this is a bimanual robot, returns [right, left] array corresponding to respective values
+
+ Note that this is a pose such that the arms are half extended
+
+ Returns:
+ np.array: default initial qpos for the right, left arms
+ """
+ # [right, left]
+ # Arms half extended
+ return np.array(
+ [0.403, -0.636, 0.114, 1.432, 0.735, 1.205, -0.269, -0.403, -0.636, -0.114, 1.432, -0.735, 1.205, 0.269]
+ )
+
+ @property
+ def base_xpos_offset(self):
+ return {
+ "bins": (-0.5, -0.1, 0),
+ "empty": (-0.29, 0, 0),
+ "table": lambda table_length: (-0.26 - table_length / 2, 0, 0),
+ }
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, 1.0))
+
+ @property
+ def _horizontal_radius(self):
+ return 0.5
+
+ @property
+ def arm_type(self):
+ return "bimanual"
+
+ @property
+ def _eef_name(self):
+ """
+        Since this is a bimanual robot, returns dict with `'right'`, `'left'` keywords corresponding to their respective
+ values
+
+ Returns:
+ dict: Dictionary containing arm-specific eef names
+ """
+ return {"right": "right_hand", "left": "left_hand"}
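
Note: as the Baxter model shows, bimanual robots return per-arm dictionaries (keyed `'right'`/`'left'`) where single-arm robots return plain strings. A quick sketch of the difference — illustrative, not part of the diff, and it assumes the Baxter XML assets are installed.

```python
# Illustrative sketch: bimanual models return per-arm dicts where single-arm models
# return plain strings. Assumes the baxter robot XML assets are available.
from robosuite.models.robots import Baxter

baxter = Baxter(idn=0)
print(baxter.arm_type)                  # "bimanual"
print(baxter.default_gripper["left"])   # "RethinkGripper"
print(baxter.eef_name["right"])         # "robot0_right_hand" (naming prefix applied)
```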
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/iiwa_robot.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/iiwa_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeccb10f79ecbf5eafd7347538cfc8900b2a0a80
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/iiwa_robot.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+from robosuite.models.robots.manipulators.manipulator_model import ManipulatorModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class IIWA(ManipulatorModel):
+ """
+ IIWA is a bright and spunky robot created by KUKA
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("robots/iiwa/robot.xml"), idn=idn)
+
+ @property
+ def default_mount(self):
+ return "RethinkMount"
+
+ @property
+ def default_gripper(self):
+ return "Robotiq140Gripper"
+
+ @property
+ def default_controller_config(self):
+ return "default_iiwa"
+
+ @property
+ def init_qpos(self):
+ return np.array([0.000, 0.650, 0.000, -1.890, 0.000, 0.600, 0.000])
+
+ @property
+ def base_xpos_offset(self):
+ return {
+ "bins": (-0.5, -0.1, 0),
+ "empty": (-0.6, 0, 0),
+ "table": lambda table_length: (-0.16 - table_length / 2, 0, 0),
+ }
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, 1.0))
+
+ @property
+ def _horizontal_radius(self):
+ return 0.5
+
+ @property
+ def arm_type(self):
+ return "single"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/jaco_robot.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/jaco_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..092d431f34ccc810c4f27400629edb787b70a8a8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/jaco_robot.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+from robosuite.models.robots.manipulators.manipulator_model import ManipulatorModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class Jaco(ManipulatorModel):
+ """
+ Jaco is a kind and assistive robot created by Kinova
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("robots/jaco/robot.xml"), idn=idn)
+
+ @property
+ def default_mount(self):
+ return "RethinkMount"
+
+ @property
+ def default_gripper(self):
+ return "JacoThreeFingerGripper"
+
+ @property
+ def default_controller_config(self):
+ return "default_jaco"
+
+ @property
+ def init_qpos(self):
+ return np.array([3.192, 3.680, -0.000, 1.170, 0.050, 3.760, 3.142])
+
+ @property
+ def base_xpos_offset(self):
+ return {
+ "bins": (-0.5, -0.1, 0),
+ "empty": (-0.6, 0, 0),
+ "table": lambda table_length: (-0.16 - table_length / 2, 0, 0),
+ }
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, 1.0))
+
+ @property
+ def _horizontal_radius(self):
+ return 0.5
+
+ @property
+ def arm_type(self):
+ return "single"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/kinova3_robot.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/kinova3_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a3835c78c39dc3d99955e5abdaa023e1ed3430
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/kinova3_robot.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+from robosuite.models.robots.manipulators.manipulator_model import ManipulatorModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class Kinova3(ManipulatorModel):
+ """
+ The Gen3 robot is the sparkly newest addition to the Kinova line
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("robots/kinova3/robot.xml"), idn=idn)
+
+ @property
+ def default_mount(self):
+ return "PhantomMount"
+
+ @property
+ def default_gripper(self):
+ return "Robotiq85Gripper"
+
+ @property
+ def default_controller_config(self):
+ return "default_kinova3"
+
+ @property
+ def init_qpos(self):
+ return np.array([0.000, 0.650, 0.000, 1.890, 0.000, 0.600, -np.pi / 2])
+
+ @property
+ def base_xpos_offset(self):
+ return {
+ "bins": (-0.5, -0.1, 0),
+ "empty": (-0.6, 0, 0),
+ "table": lambda table_length: (-0.16 - table_length / 2, 0, 0),
+ }
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, 1.0))
+
+ @property
+ def _horizontal_radius(self):
+ return 0.5
+
+ @property
+ def arm_type(self):
+ return "single"
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/manipulator_model.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/manipulator_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..7197d69f15212b4fd02ebffca71fb98b7fda617a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/manipulator_model.py
@@ -0,0 +1,177 @@
+from collections import OrderedDict
+
+import numpy as np
+
+from robosuite.models.robots import RobotModel
+from robosuite.utils.mjcf_utils import find_elements, string_to_array
+
+
+class ManipulatorModel(RobotModel):
+ """
+ Base class for all manipulator models (robot arm(s) with gripper(s)).
+
+ Args:
+ fname (str): Path to relevant xml file from which to create this robot instance
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, fname, idn=0):
+ # Always run super init first
+ super().__init__(fname, idn=idn)
+
+ # key: gripper name and value: gripper model
+ self.grippers = OrderedDict()
+
+ # Grab hand's offset from final robot link (string -> np.array -> elements [1, 2, 3, 0] (x, y, z, w))
+ # Different case based on whether we're dealing with single or bimanual armed robot
+ if self.arm_type == "single":
+ hand_element = find_elements(
+ root=self.root, tags="body", attribs={"name": self.eef_name}, return_first=True
+ )
+ self.hand_rotation_offset = string_to_array(hand_element.get("quat", "1 0 0 0"))[[1, 2, 3, 0]]
+ else: # "bimanual" case
+ self.hand_rotation_offset = {}
+ for arm in ("right", "left"):
+ hand_element = find_elements(
+ root=self.root, tags="body", attribs={"name": self.eef_name[arm]}, return_first=True
+ )
+ self.hand_rotation_offset[arm] = string_to_array(hand_element.get("quat", "1 0 0 0"))[[1, 2, 3, 0]]
+
+ # Get camera names for this robot
+ self.cameras = self.get_element_names(self.worldbody, "camera")
+
+ def add_gripper(self, gripper, arm_name=None):
+ """
+ Mounts @gripper to arm.
+
+ Throws error if robot already has a gripper or gripper type is incorrect.
+
+ Args:
+ gripper (GripperModel): gripper MJCF model
+ arm_name (str): name of arm mount -- defaults to self.eef_name if not specified
+
+ Raises:
+ ValueError: [Multiple grippers]
+ """
+ if arm_name is None:
+ arm_name = self.eef_name
+ if arm_name in self.grippers:
+ raise ValueError("Attempts to add multiple grippers to one body")
+
+ self.merge(gripper, merge_body=arm_name)
+
+ self.grippers[arm_name] = gripper
+
+ # Update cameras in this model
+ self.cameras = self.get_element_names(self.worldbody, "camera")
+
+ # -------------------------------------------------------------------------------------- #
+ # Public Properties: In general, these are the name-adjusted versions from the private #
+ # attributes pulled from their respective raw xml files #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def eef_name(self):
+ """
+ Returns:
+ str or dict of str: Prefix-adjusted eef name for this robot. If bimanual robot, returns {"left", "right"}
+ keyword-mapped eef names
+ """
+ return self.correct_naming(self._eef_name)
+
+ @property
+ def models(self):
+ """
+        Returns a list of all (sub-)models owned by this robot model. By default, this includes the gripper model,
+ if specified
+
+ Returns:
+ list: models owned by this object
+ """
+ models = super().models
+ return models + list(self.grippers.values())
+
+ # -------------------------------------------------------------------------------------- #
+ # -------------------------- Private Properties ---------------------------------------- #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def _important_sites(self):
+ """
+ Returns:
+ dict: (Default is no important sites; i.e.: empty dict)
+ """
+ return {}
+
+ @property
+ def _eef_name(self):
+ """
+ XML eef name for this robot to which grippers can be attached. Note that these should be the raw
+ string names directly pulled from a robot's corresponding XML file, NOT the adjusted name with an
+ auto-generated naming prefix
+
+ Returns:
+ str: Raw XML eef name for this robot (default is "right_hand")
+ """
+ return "right_hand"
+
+ # -------------------------------------------------------------------------------------- #
+ # All subclasses must implement the following properties #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def default_gripper(self):
+ """
+ Defines the default gripper type for this robot that gets added to end effector
+
+ Returns:
+ str: Default gripper name to add to this robot
+ """
+ raise NotImplementedError
+
+ @property
+ def arm_type(self):
+ """
+ Type of robot arm. Should be either "bimanual" or "single" (or something else if it gets added in the future)
+
+ Returns:
+ str: Type of robot
+ """
+ raise NotImplementedError
+
+ @property
+ def base_xpos_offset(self):
+ """
+ Defines the dict of various (x,y,z) tuple offsets relative to specific arenas placed at (0,0,0)
+ Assumes robot is facing forwards (in the +x direction) when determining offset. Should have entries for each
+        manipulator arena case; i.e.: "bins", "empty", and "table"
+
+ Returns:
+ dict:
+
+ :`'bins'`: (x,y,z) robot offset if placed in bins arena
+ :`'empty'`: (x,y,z) robot offset if placed in the empty arena
+ :`'table'`: lambda function that takes in table_length and returns corresponding (x,y,z) offset
+ if placed in the table arena
+ """
+ raise NotImplementedError
+
+ @property
+ def top_offset(self):
+ raise NotImplementedError
+
+ @property
+ def _horizontal_radius(self):
+ raise NotImplementedError
+
+ @property
+ def default_mount(self):
+ raise NotImplementedError
+
+ @property
+ def default_controller_config(self):
+ raise NotImplementedError
+
+ @property
+ def init_qpos(self):
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/panda_robot.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/panda_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..336440da8de6d33b52089a1ffb8f58f3e5e17db8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/panda_robot.py
@@ -0,0 +1,55 @@
+import numpy as np
+
+from robosuite.models.robots.manipulators.manipulator_model import ManipulatorModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class Panda(ManipulatorModel):
+ """
+ Panda is a sensitive single-arm robot designed by Franka.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("robots/panda/robot.xml"), idn=idn)
+
+ # Set joint damping
+ self.set_joint_attribute(attrib="damping", values=np.array((0.1, 0.1, 0.1, 0.1, 0.1, 0.01, 0.01)))
+
+ @property
+ def default_mount(self):
+ return "RethinkMount"
+
+ @property
+ def default_gripper(self):
+ return "PandaGripper"
+
+ @property
+ def default_controller_config(self):
+ return "default_panda"
+
+ @property
+ def init_qpos(self):
+ return np.array([0, np.pi / 16.0, 0.00, -np.pi / 2.0 - np.pi / 3.0, 0.00, np.pi - 0.2, np.pi / 4])
+
+ @property
+ def base_xpos_offset(self):
+ return {
+ "bins": (-0.5, -0.1, 0),
+ "empty": (-0.6, 0, 0),
+ "table": lambda table_length: (-0.16 - table_length / 2, 0, 0),
+ }
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, 1.0))
+
+ @property
+ def _horizontal_radius(self):
+ return 0.5
+
+ @property
+ def arm_type(self):
+ return "single"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/sawyer_robot.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/sawyer_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc9c4e763b09aa3e750f88e6493975edfe66b6ed
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/sawyer_robot.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+from robosuite.models.robots.manipulators.manipulator_model import ManipulatorModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class Sawyer(ManipulatorModel):
+ """
+ Sawyer is a witty single-arm robot designed by Rethink Robotics.
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("robots/sawyer/robot.xml"), idn=idn)
+
+ @property
+ def default_mount(self):
+ return "RethinkMount"
+
+ @property
+ def default_gripper(self):
+ return "RethinkGripper"
+
+ @property
+ def default_controller_config(self):
+ return "default_sawyer"
+
+ @property
+ def init_qpos(self):
+ return np.array([0, -1.18, 0.00, 2.18, 0.00, 0.57, -1.57])
+
+ @property
+ def base_xpos_offset(self):
+ return {
+ "bins": (-0.5, -0.1, 0),
+ "empty": (-0.6, 0, 0),
+ "table": lambda table_length: (-0.16 - table_length / 2, 0, 0),
+ }
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, 1.0))
+
+ @property
+ def _horizontal_radius(self):
+ return 0.5
+
+ @property
+ def arm_type(self):
+ return "single"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/ur5e_robot.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/ur5e_robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecd7a48b3c33fd94fd72187244907f93f74b1dca
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/manipulators/ur5e_robot.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+from robosuite.models.robots.manipulators.manipulator_model import ManipulatorModel
+from robosuite.utils.mjcf_utils import xml_path_completion
+
+
+class UR5e(ManipulatorModel):
+ """
+ UR5e is a sleek and elegant new robot created by Universal Robots
+
+ Args:
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, idn=0):
+ super().__init__(xml_path_completion("robots/ur5e/robot.xml"), idn=idn)
+
+ @property
+ def default_mount(self):
+ return "RethinkMount"
+
+ @property
+ def default_gripper(self):
+ return "Robotiq85Gripper"
+
+ @property
+ def default_controller_config(self):
+ return "default_ur5e"
+
+ @property
+ def init_qpos(self):
+ return np.array([-0.470, -1.735, 2.480, -2.275, -1.590, -1.991])
+
+ @property
+ def base_xpos_offset(self):
+ return {
+ "bins": (-0.5, -0.1, 0),
+ "empty": (-0.6, 0, 0),
+ "table": lambda table_length: (-0.16 - table_length / 2, 0, 0),
+ }
+
+ @property
+ def top_offset(self):
+ return np.array((0, 0, 1.0))
+
+ @property
+ def _horizontal_radius(self):
+ return 0.5
+
+ @property
+ def arm_type(self):
+ return "single"
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/robots/robot_model.py b/phantom/submodules/phantom-robosuite/robosuite/models/robots/robot_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..63758c2094dd7dd97c6d551e708cfc0bab9d9b98
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/robots/robot_model.py
@@ -0,0 +1,298 @@
+import numpy as np
+
+from robosuite.models.base import MujocoXMLModel
+from robosuite.utils.mjcf_utils import ROBOT_COLLISION_COLOR, array_to_string, string_to_array
+from robosuite.utils.transform_utils import euler2mat, mat2quat
+
+REGISTERED_ROBOTS = {}
+
+
+def register_robot(target_class):
+ REGISTERED_ROBOTS[target_class.__name__] = target_class
+
+
+def create_robot(robot_name, *args, **kwargs):
+ """
+ Instantiates a Robot object.
+
+ Args:
+ robot_name (str): Name of the robot to initialize
+ *args: Additional arguments to pass to the specific Robot class initializer
+ **kwargs: Additional arguments to pass to the specific Robot class initializer
+
+ Returns:
+ Robot: Desired robot
+
+ Raises:
+ Exception: [Invalid robot name]
+ """
+ if robot_name not in REGISTERED_ROBOTS:
+ raise Exception(
+ "Robot {} not found. Make sure it is a registered robot among: {}".format(
+ robot_name, ", ".join(REGISTERED_ROBOTS)
+ )
+ )
+ return REGISTERED_ROBOTS[robot_name](*args, **kwargs)
+
+
+class RobotModelMeta(type):
+ """Metaclass for registering robot arms"""
+
+ def __new__(meta, name, bases, class_dict):
+ cls = super().__new__(meta, name, bases, class_dict)
+
+ # List all environments that should not be registered here.
+ _unregistered_envs = ["RobotModel", "ManipulatorModel"]
+
+ if cls.__name__ not in _unregistered_envs:
+ register_robot(cls)
+ return cls
+
+
+class RobotModel(MujocoXMLModel, metaclass=RobotModelMeta):
+ """
+ Base class for all robot models.
+
+ Args:
+ fname (str): Path to relevant xml file from which to create this robot instance
+ idn (int or str): Number or some other unique identification string for this robot instance
+ """
+
+ def __init__(self, fname, idn=0):
+ super().__init__(fname, idn=idn)
+
+ # Define other variables that get filled later
+ self.mount = None
+
+ # Get camera names for this robot
+ self.cameras = self.get_element_names(self.worldbody, "camera")
+
+ # By default, set small frictionloss and armature values
+ self.set_joint_attribute(attrib="frictionloss", values=0.1 * np.ones(self.dof), force=False)
+ self.set_joint_attribute(attrib="damping", values=0.1 * np.ones(self.dof), force=False)
+ self.set_joint_attribute(
+ attrib="armature", values=np.array([5.0 / (i + 1) for i in range(self.dof)]), force=False
+ )
+
+ def set_base_xpos(self, pos):
+ """
+ Places the robot on position @pos.
+
+ Args:
+ pos (3-array): (x,y,z) position to place robot base
+ """
+ self._elements["root_body"].set("pos", array_to_string(pos - self.bottom_offset))
+
+ def set_base_ori(self, rot):
+ """
+ Rotates robot by rotation @rot from its original orientation.
+
+ Args:
+ rot (3-array): (r,p,y) euler angles specifying the orientation for the robot base
+ """
+ # xml quat assumes w,x,y,z so we need to convert to this format from outputted x,y,z,w format from fcn
+ rot = mat2quat(euler2mat(rot))[[3, 0, 1, 2]]
+ self._elements["root_body"].set("quat", array_to_string(rot))
+
+ def set_joint_attribute(self, attrib, values, force=True):
+ """
+ Sets joint attributes, e.g.: friction loss, damping, etc.
+
+ Args:
+ attrib (str): Attribute to set for all joints
+ values (n-array): Values to set for each joint
+ force (bool): If True, will automatically override any pre-existing value. Otherwise, if a value already
+                exists for this attribute, it will be skipped
+
+ Raises:
+ AssertionError: [Inconsistent dimension sizes]
+ """
+ assert values.size == len(self._elements["joints"]), (
+ "Error setting joint attributes: "
+ + "Values must be same size as joint dimension. Got {}, expected {}!".format(values.size, self.dof)
+ )
+ for i, joint in enumerate(self._elements["joints"]):
+ if force or joint.get(attrib, None) is None:
+ joint.set(attrib, array_to_string(np.array([values[i]])))
+
+ def add_mount(self, mount):
+ """
+ Mounts @mount to arm.
+
+ Throws error if robot already has a mount or if mount type is incorrect.
+
+ Args:
+ mount (MountModel): mount MJCF model
+
+ Raises:
+ ValueError: [mount already added]
+ """
+ if self.mount is not None:
+ raise ValueError("Mount already added for this robot!")
+
+ # First adjust mount's base position
+ offset = self.base_offset - mount.top_offset
+ mount._elements["root_body"].set("pos", array_to_string(offset))
+
+ self.merge(mount, merge_body=self.root_body)
+
+ self.mount = mount
+
+ # Update cameras in this model
+ self.cameras = self.get_element_names(self.worldbody, "camera")
+
+ # -------------------------------------------------------------------------------------- #
+ # Public Properties: In general, these are the name-adjusted versions from the private #
+ # attributes pulled from their respective raw xml files #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def naming_prefix(self):
+ return "robot{}_".format(self.idn)
+
+ @property
+ def dof(self):
+ """
+ Defines the number of DOF of the robot
+
+ Returns:
+ int: robot DOF
+ """
+ return len(self._joints)
+
+ @property
+ def bottom_offset(self):
+ """
+ Returns vector from model root body to model bottom.
+ By default, this is equivalent to this robot's mount's (bottom_offset - top_offset) + this robot's base offset
+
+ Returns:
+ np.array: (dx, dy, dz) offset vector
+ """
+ return (
+ (self.mount.bottom_offset - self.mount.top_offset) + self._base_offset
+ if self.mount is not None
+ else self._base_offset
+ )
+
+ @property
+ def horizontal_radius(self):
+ """
+ Returns maximum distance from model root body to any radial point of the model. This method takes into
+ account the mount horizontal radius as well
+
+ Returns:
+ float: radius
+ """
+ return max(self._horizontal_radius, self.mount.horizontal_radius)
+
+ @property
+ def models(self):
+ """
+        Returns a list of all (sub-)models owned by this robot model. By default, this includes the mount model,
+ if specified
+
+ Returns:
+ list: models owned by this object
+ """
+ return [self.mount] if self.mount is not None else []
+
+ @property
+ def contact_geom_rgba(self):
+ return ROBOT_COLLISION_COLOR
+
+ # -------------------------------------------------------------------------------------- #
+ # All subclasses must implement the following properties #
+ # -------------------------------------------------------------------------------------- #
+
+ @property
+ def default_mount(self):
+ """
+ Defines the default mount type for this robot that gets added to root body (base)
+
+ Returns:
+ str: Default mount name to add to this robot
+ """
+ raise NotImplementedError
+
+ @property
+ def default_controller_config(self):
+ """
+ Defines the name of default controller config file in the controllers/config directory for this robot.
+
+ Returns:
+ str: filename of default controller config for this robot
+ """
+ raise NotImplementedError
+
+ @property
+ def init_qpos(self):
+ """
+ Defines the default rest qpos of this robot
+
+ Returns:
+ np.array: Default init qpos of this robot
+ """
+ raise NotImplementedError
+
+ @property
+ def base_xpos_offset(self):
+ """
+ Defines the dict of various (x,y,z) tuple offsets relative to specific arenas placed at (0,0,0)
+ Assumes robot is facing forwards (in the +x direction) when determining offset. Should have entries for each
+        arena case; i.e.: "bins", "empty", and "table"
+
+ Returns:
+ dict: Dict mapping arena names to robot offsets from the global origin (dict entries may also be lambdas
+ for variable offsets)
+ """
+ raise NotImplementedError
+
+ @property
+ def top_offset(self):
+ """
+ Returns vector from model root body to model top.
+ Useful for, e.g. placing models on a surface.
+ Must be defined by subclass.
+
+ Returns:
+ np.array: (dx, dy, dz) offset vector
+ """
+ raise NotImplementedError
+
+ @property
+ def _horizontal_radius(self):
+ """
+ Returns maximum distance from model root body to any radial point of the model.
+
+ Helps us put models programmatically without them flying away due to a huge initial contact force.
+ Must be defined by subclass.
+
+ Returns:
+ float: radius
+ """
+ raise NotImplementedError
+
+ @property
+ def _important_sites(self):
+ """
+ Returns:
+ dict: (Default is no important sites; i.e.: empty dict)
+ """
+ return {}
+
+ @property
+ def _important_geoms(self):
+ """
+ Returns:
+ dict: (Default is no important geoms; i.e.: empty dict)
+ """
+ return {}
+
+ @property
+ def _important_sensors(self):
+ """
+ Returns:
+ dict: (Default is no sensors; i.e.: empty dict)
+ """
+ return {}
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/tasks/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/models/tasks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f92c679f3dc810b4870bcc4c30b185299c2da5fd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/tasks/__init__.py
@@ -0,0 +1,2 @@
+from .task import Task
+from .manipulation_task import ManipulationTask
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/tasks/manipulation_task.py b/phantom/submodules/phantom-robosuite/robosuite/models/tasks/manipulation_task.py
new file mode 100644
index 0000000000000000000000000000000000000000..8309e2420d1145be40b9c3872743f2939d6b061a
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/tasks/manipulation_task.py
@@ -0,0 +1,7 @@
+from robosuite.models.tasks.task import Task
+
+
+class ManipulationTask(Task):
+ """
+ A manipulation-specific task. This is currently a future-proofing placeholder.
+ """
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/tasks/task.py b/phantom/submodules/phantom-robosuite/robosuite/models/tasks/task.py
new file mode 100644
index 0000000000000000000000000000000000000000..658fc2f3fd463d8a3bde4ffd1f7496c3070ab7bc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/tasks/task.py
@@ -0,0 +1,191 @@
+from copy import deepcopy
+
+from robosuite.models.objects import MujocoObject
+from robosuite.models.robots import RobotModel
+from robosuite.models.world import MujocoWorldBase
+from robosuite.utils.mjcf_utils import get_ids
+
+
+class Task(MujocoWorldBase):
+ """
+ Creates MJCF model for a task performed.
+
+ A task consists of one or more robots interacting with a variable number of
+ objects. This class combines the robot(s), the arena, and the objects
+ into a single MJCF model.
+
+ Args:
+ mujoco_arena (Arena): MJCF model of robot workspace
+
+ mujoco_robots (RobotModel or list of RobotModel): MJCF model of robot model(s) (list)
+
+ mujoco_objects (None or MujocoObject or list of MujocoObject): a list of MJCF models of physical objects
+
+ Raises:
+ AssertionError: [Invalid input object type]
+ """
+
+ def __init__(
+ self,
+ mujoco_arena,
+ mujoco_robots,
+ mujoco_objects=None,
+ ):
+ super().__init__()
+
+ # Store references to all models
+ self.mujoco_arena = mujoco_arena
+ self.mujoco_robots = [mujoco_robots] if isinstance(mujoco_robots, RobotModel) else mujoco_robots
+ if mujoco_objects is None:
+ self.mujoco_objects = []
+ else:
+ self.mujoco_objects = [mujoco_objects] if isinstance(mujoco_objects, MujocoObject) else mujoco_objects
+
+ # Merge all models
+ self.merge_arena(self.mujoco_arena)
+ for mujoco_robot in self.mujoco_robots:
+ self.merge_robot(mujoco_robot)
+ self.merge_objects(self.mujoco_objects)
+
+ self._instances_to_ids = None
+ self._geom_ids_to_instances = None
+ self._site_ids_to_instances = None
+ self._classes_to_ids = None
+ self._geom_ids_to_classes = None
+ self._site_ids_to_classes = None
+
+ def merge_robot(self, mujoco_robot):
+ """
+ Adds robot model to the MJCF model.
+
+ Args:
+ mujoco_robot (RobotModel): robot to merge into this MJCF model
+ """
+ self.merge(mujoco_robot)
+
+ def merge_arena(self, mujoco_arena):
+ """
+ Adds arena model to the MJCF model.
+
+ Args:
+ mujoco_arena (Arena): arena to merge into this MJCF model
+ """
+ self.merge(mujoco_arena)
+
+ def merge_objects(self, mujoco_objects):
+ """
+ Adds object models to the MJCF model.
+
+ Args:
+ mujoco_objects (list of MujocoObject): objects to merge into this MJCF model
+ """
+ for mujoco_obj in mujoco_objects:
+ # Make sure we actually got a MujocoObject
+ assert isinstance(mujoco_obj, MujocoObject), "Tried to merge non-MujocoObject! Got type: {}".format(
+ type(mujoco_obj)
+ )
+ # Merge this object
+ self.merge_assets(mujoco_obj)
+ self.worldbody.append(mujoco_obj.get_obj())
+
+ def generate_id_mappings(self, sim):
+ """
+ Generates IDs mapping class instances to set of (visual) geom IDs corresponding to that class instance
+
+ Args:
+ sim (MjSim): Current active mujoco simulation object
+ """
+ self._instances_to_ids = {}
+ self._geom_ids_to_instances = {}
+ self._site_ids_to_instances = {}
+ self._classes_to_ids = {}
+ self._geom_ids_to_classes = {}
+ self._site_ids_to_classes = {}
+
+ models = [model for model in self.mujoco_objects]
+ for robot in self.mujoco_robots:
+ models += [robot] + robot.models
+
+ # Parse all mujoco models from robots and objects
+ for model in models:
+ # Grab model class name and visual IDs
+ cls = str(type(model)).split("'")[1].split(".")[-1]
+ inst = model.name
+ id_groups = [
+ get_ids(sim=sim, elements=model.visual_geoms + model.contact_geoms, element_type="geom"),
+ get_ids(sim=sim, elements=model.sites, element_type="site"),
+ ]
+ group_types = ("geom", "site")
+ ids_to_instances = (self._geom_ids_to_instances, self._site_ids_to_instances)
+ ids_to_classes = (self._geom_ids_to_classes, self._site_ids_to_classes)
+
+ # Add entry to mapping dicts
+
+ # Instances should be unique
+ assert inst not in self._instances_to_ids, f"Instance {inst} already registered; should be unique"
+ self._instances_to_ids[inst] = {}
+
+ # Classes may not be unique
+ if cls not in self._classes_to_ids:
+ self._classes_to_ids[cls] = {group_type: [] for group_type in group_types}
+
+ for ids, group_type, ids_to_inst, ids_to_cls in zip(
+ id_groups, group_types, ids_to_instances, ids_to_classes
+ ):
+ # Add geom, site ids
+ self._instances_to_ids[inst][group_type] = ids
+ self._classes_to_ids[cls][group_type] += ids
+
+ # Add reverse mappings as well
+ for idn in ids:
+ assert idn not in ids_to_inst, f"ID {idn} already registered; should be unique"
+ ids_to_inst[idn] = inst
+ ids_to_cls[idn] = cls
+
+ @property
+ def geom_ids_to_instances(self):
+ """
+ Returns:
+ dict: Mapping from geom IDs in sim to specific class instance names
+ """
+ return deepcopy(self._geom_ids_to_instances)
+
+ @property
+ def site_ids_to_instances(self):
+ """
+ Returns:
+ dict: Mapping from site IDs in sim to specific class instance names
+ """
+ return deepcopy(self._site_ids_to_instances)
+
+ @property
+ def instances_to_ids(self):
+ """
+ Returns:
+ dict: Mapping from specific class instance names to {geom, site} IDs in sim
+ """
+ return deepcopy(self._instances_to_ids)
+
+ @property
+ def geom_ids_to_classes(self):
+ """
+ Returns:
+ dict: Mapping from geom IDs in sim to specific classes
+ """
+ return deepcopy(self._geom_ids_to_classes)
+
+ @property
+ def site_ids_to_classes(self):
+ """
+ Returns:
+ dict: Mapping from site IDs in sim to specific classes
+ """
+ return deepcopy(self._site_ids_to_classes)
+
+ @property
+ def classes_to_ids(self):
+ """
+ Returns:
+ dict: Mapping from specific classes to {geom, site} IDs in sim
+ """
+ return deepcopy(self._classes_to_ids)
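
Note: `Task` merges an arena, one or more robot models, and any objects into a single MJCF tree, which is what the environments later compile into a simulation. A minimal composition sketch follows — illustrative, not part of the diff; the arena and robot classes are assumed to be available together with their XML assets.

```python
# Illustrative sketch: compose a single-robot tabletop scene with the classes above.
# Assumes robosuite's arena/robot XML assets are available on disk.
from robosuite.models.arenas import TableArena
from robosuite.models.objects import BoxObject
from robosuite.models.robots import Panda
from robosuite.models.tasks import ManipulationTask

robot = Panda(idn=0)
robot.set_base_xpos(robot.base_xpos_offset["table"](0.8))  # table_length = 0.8 m (example)

arena = TableArena(table_full_size=(0.8, 0.8, 0.05))
cube = BoxObject(name="cube", size=[0.02, 0.02, 0.02])

task = ManipulationTask(mujoco_arena=arena, mujoco_robots=[robot], mujoco_objects=[cube])
print(len(task.worldbody))  # arena, robot, and object bodies merged into one worldbody
```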
diff --git a/phantom/submodules/phantom-robosuite/robosuite/models/world.py b/phantom/submodules/phantom-robosuite/robosuite/models/world.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee2f265d374dd0e9a7e2586176c0758a91f015dc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/models/world.py
@@ -0,0 +1,13 @@
+import robosuite.macros as macros
+from robosuite.models.base import MujocoXML
+from robosuite.utils.mjcf_utils import convert_to_string, find_elements, xml_path_completion
+
+
+class MujocoWorldBase(MujocoXML):
+ """Base class to inherit all mujoco worlds from."""
+
+ def __init__(self):
+ super().__init__(xml_path_completion("base.xml"))
+ # Modify the simulation timestep to be the requested value
+ options = find_elements(root=self.root, tags="option", attribs=None, return_first=True)
+ options.set("timestep", convert_to_string(macros.SIMULATION_TIMESTEP))
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ac7004e9d49e6122425170c97027b242029305c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/__init__.py
@@ -0,0 +1 @@
+from .base import load_renderer_config
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/base.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeb60c3dde1523b6f89352614027c4e66cecf696
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/base.py
@@ -0,0 +1,78 @@
+"""
+This file contains the base renderer class for Mujoco environments.
+"""
+
+import abc
+import json
+import os
+
+
+def load_renderer_config(renderer):
+ """Loads the config of the specified renderer.
+ Modify the dictionary returned by this function
+    according to requirements.
+
+ Args:
+ renderer (str): Name of the renderer to use.
+
+ Returns:
+ dict: renderer default config.
+ """
+ if renderer == "nvisii":
+ fname = "config/nvisii_config.json"
+ else:
+        raise ValueError(f"renderer type can only be 'nvisii', got '{renderer}'")
+
+ dir_path = os.path.dirname(__file__)
+ with open(os.path.join(dir_path, fname)) as f:
+ config = json.load(f)
+
+ return config
+
+
+class Renderer:
+ """
+ Base class for all robosuite renderers
+ Defines basic interface for all renderers to adhere to
+ """
+
+ def __init__(self, env, renderer_type="mujoco"):
+ self.env = env
+ self.renderer_type = renderer_type
+
+ def __str__(self):
+ """Prints the renderer type in a formatted way
+
+ Returns:
+ str: string representing the renderer
+ """
+        return f"<{type(self).__name__}: renderer_type={self.renderer_type}>"
+
+ @abc.abstractmethod
+ def render(self, **kwargs):
+ """Renders the current state with the specified renderer"""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def update(self):
+ """Updates the states in the renderer (for NVISII)"""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def close(self):
+ """Closes the renderer objects"""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def reset(self):
+ """Reset the renderer with initial states for environment"""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def get_pixel_obs(self):
+ """Get the pixel observations from the given renderer
+
+ Returns:
+            np.ndarray: numpy array representing pixels of the renderer
+ """
+ raise NotImplementedError
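
Note: `load_renderer_config` simply reads the JSON defaults stored next to the module (the `config/nvisii_config.json` added below) and returns a plain dict that callers can tweak before constructing a renderer. A small usage sketch — illustrative, not part of the diff.

```python
# Illustrative sketch: fetch the default NVISII renderer settings and adjust them.
# Assumes config/nvisii_config.json from this diff is installed next to the module.
from robosuite.renderers import load_renderer_config

config = load_renderer_config("nvisii")
config["width"], config["height"] = 640, 480  # override the 1280x720 defaults
config["spp"] = 64                            # fewer samples per pixel for quick previews
print(config)
```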
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/base_parser.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/base_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e0ff7aed6f533b60a3aa2f8914cb3fa5db1bf04
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/base_parser.py
@@ -0,0 +1,55 @@
+import abc
+import xml.etree.ElementTree as ET
+
+
+class BaseParser(object):
+ """
+ Base class for Parser objects used by renderers.
+ """
+
+ def __init__(self, renderer, env):
+ """
+        Parse the mujoco xml and initialize renderer objects.
+
+ Args:
+ renderer: the renderer
+ env : Mujoco env
+ """
+
+ self.renderer = renderer
+ self.env = env
+ self.xml_root = ET.fromstring(self.env.sim.model.get_xml())
+ self.parent_map = {c: p for p in self.xml_root.iter() for c in p}
+ self.visual_objects = {}
+
+ @abc.abstractmethod
+ def parse_textures(self):
+ """
+ Parse and load all textures and store them
+ """
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def parse_materials(self):
+ """
+ Parse all materials and use texture mapping to initialize materials
+ """
+ raise NotImplementedError
+
+ def parse_cameras(self):
+ """
+ Parse cameras and initialize the cameras.
+ """
+ raise NotImplementedError
+
+ def parse_meshes(self):
+ """
+ Create mapping of meshes.
+ """
+ raise NotImplementedError
+
+ def parse_geometries(self):
+ """
+ Iterate through each geometry and load it in the renderer.
+ """
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/config/nvisii_config.json b/phantom/submodules/phantom-robosuite/robosuite/renderers/config/nvisii_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d6f5d862ada51716402a7d22b9780b92fdd6c7c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/config/nvisii_config.json
@@ -0,0 +1,14 @@
+{
+ "img_path": "images/",
+ "width": 1280,
+ "height": 720,
+ "spp": 512,
+ "use_noise": false,
+ "debug_mode": false,
+ "video_mode": false,
+ "video_path": "videos/",
+ "video_name": "robosuite_video_0.mp4",
+ "video_fps": 30,
+ "verbose": 1,
+ "vision_modalities": null
+}
\ No newline at end of file
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/context/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/context/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/context/egl_context.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/context/egl_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bfc15c18aca6e96ae4ad9f6fba0c45642899776
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/context/egl_context.py
@@ -0,0 +1,155 @@
+# Modifications Copyright 2022 The robosuite Authors
+# Original Copyright 2018 The dm_control Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import atexit
+import ctypes
+import os
+
+PYOPENGL_PLATFORM = os.environ.get("PYOPENGL_PLATFORM")
+
+if not PYOPENGL_PLATFORM:
+ os.environ["PYOPENGL_PLATFORM"] = "egl"
+elif PYOPENGL_PLATFORM.lower() != "egl":
+ raise ImportError(
+ "Cannot use EGL rendering platform. "
+ "The PYOPENGL_PLATFORM environment variable is set to {!r} "
+        "(should be either unset or 'egl').".format(PYOPENGL_PLATFORM)
+ )
+
+from mujoco.egl import egl_ext as EGL
+from OpenGL import error
+
+
+def create_initialized_egl_device_display(device_id=0):
+ """Creates an initialized EGL display directly on a device."""
+ all_devices = EGL.eglQueryDevicesEXT()
+ selected_device = (
+ os.environ.get("CUDA_VISIBLE_DEVICES", None)
+ if os.environ.get("MUJOCO_EGL_DEVICE_ID", None) is None
+ else os.environ.get("MUJOCO_EGL_DEVICE_ID", None)
+ )
+ if selected_device is None:
+ candidates = all_devices
+ if device_id == -1:
+ device_idx = 0
+ else:
+ device_idx = device_id
+ else:
+ if not selected_device.isdigit():
+ device_inds = [int(x) for x in selected_device.split(",")]
+ if device_id == -1:
+ device_idx = device_inds[0]
+ else:
+ assert device_id in device_inds, "specified device id is not made visible in environment variables."
+ device_idx = device_id
+ else:
+ device_idx = int(selected_device)
+ if not 0 <= device_idx < len(all_devices):
+ raise RuntimeError(
+ f"The MUJOCO_EGL_DEVICE_ID environment variable must be an integer "
+ f"between 0 and {len(all_devices)-1} (inclusive), got {device_idx}."
+ )
+ candidates = all_devices[device_idx : device_idx + 1]
+ for device in candidates:
+ display = EGL.eglGetPlatformDisplayEXT(EGL.EGL_PLATFORM_DEVICE_EXT, device, None)
+ if display != EGL.EGL_NO_DISPLAY and EGL.eglGetError() == EGL.EGL_SUCCESS:
+ # `eglInitialize` may or may not raise an exception on failure depending
+ # on how PyOpenGL is configured. We therefore catch a `GLError` and also
+ # manually check the output of `eglGetError()` here.
+ try:
+ initialized = EGL.eglInitialize(display, None, None)
+ except error.GLError:
+ pass
+ else:
+ if initialized == EGL.EGL_TRUE and EGL.eglGetError() == EGL.EGL_SUCCESS:
+ return display
+ return EGL.EGL_NO_DISPLAY
+
+
+global EGL_DISPLAY
+EGL_DISPLAY = None
+
+EGL_ATTRIBUTES = (
+ EGL.EGL_RED_SIZE,
+ 8,
+ EGL.EGL_GREEN_SIZE,
+ 8,
+ EGL.EGL_BLUE_SIZE,
+ 8,
+ EGL.EGL_ALPHA_SIZE,
+ 8,
+ EGL.EGL_DEPTH_SIZE,
+ 24,
+ EGL.EGL_STENCIL_SIZE,
+ 8,
+ EGL.EGL_COLOR_BUFFER_TYPE,
+ EGL.EGL_RGB_BUFFER,
+ EGL.EGL_SURFACE_TYPE,
+ EGL.EGL_PBUFFER_BIT,
+ EGL.EGL_RENDERABLE_TYPE,
+ EGL.EGL_OPENGL_BIT,
+ EGL.EGL_NONE,
+)
+
+
+class EGLGLContext:
+ """An EGL context for headless accelerated OpenGL rendering on GPU devices."""
+
+ def __init__(self, max_width, max_height, device_id=0):
+
+ del max_width, max_height # unused
+ num_configs = ctypes.c_long()
+ config_size = 1
+ config = EGL.EGLConfig()
+ EGL.eglReleaseThread()
+ global EGL_DISPLAY
+ if EGL_DISPLAY is None:
+ # only initialize for the first time
+ EGL_DISPLAY = create_initialized_egl_device_display(device_id=device_id)
+ if EGL_DISPLAY == EGL.EGL_NO_DISPLAY:
+ raise ImportError(
+                "Cannot initialize an EGL device display. This likely means that your EGL "
+ "driver does not support the PLATFORM_DEVICE extension, which is "
+ "required for creating a headless rendering context."
+ )
+ atexit.register(EGL.eglTerminate, EGL_DISPLAY)
+ EGL.eglChooseConfig(EGL_DISPLAY, EGL_ATTRIBUTES, ctypes.byref(config), config_size, num_configs)
+ if num_configs.value < 1:
+ raise RuntimeError(
+ "EGL failed to find a framebuffer configuration that matches the "
+ "desired attributes: {}".format(EGL_ATTRIBUTES)
+ )
+ EGL.eglBindAPI(EGL.EGL_OPENGL_API)
+ self._context = EGL.eglCreateContext(EGL_DISPLAY, config, EGL.EGL_NO_CONTEXT, None)
+ if not self._context:
+ raise RuntimeError("Cannot create an EGL context.")
+
+ def make_current(self):
+ if not EGL.eglMakeCurrent(EGL_DISPLAY, EGL.EGL_NO_SURFACE, EGL.EGL_NO_SURFACE, self._context):
+ raise RuntimeError("Failed to make the EGL context current.")
+
+ def free(self):
+ """Frees resources associated with this context."""
+ if self._context:
+ current_context = EGL.eglGetCurrentContext()
+ if current_context and self._context.address == current_context.address:
+ EGL.eglMakeCurrent(EGL_DISPLAY, EGL.EGL_NO_SURFACE, EGL.EGL_NO_SURFACE, EGL.EGL_NO_CONTEXT)
+ EGL.eglDestroyContext(EGL_DISPLAY, self._context)
+ EGL.eglReleaseThread()
+ self._context = None
+
+ def __del__(self):
+ self.free()
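
Note: `EGLGLContext` provides headless, GPU-accelerated offscreen rendering; the device it binds to is resolved from `MUJOCO_EGL_DEVICE_ID` (or `CUDA_VISIBLE_DEVICES`) together with the `device_id` argument. A minimal sketch — illustrative, not part of the diff; it assumes an EGL-capable GPU driver and that `PYOPENGL_PLATFORM` is unset or already set to `egl`.

```python
# Illustrative sketch: create a headless EGL context and make it current before any
# offscreen rendering. Assumes an EGL-capable GPU driver is available.
import os

os.environ.setdefault("MUJOCO_EGL_DEVICE_ID", "0")  # optionally pin the first GPU

from robosuite.renderers.context.egl_context import EGLGLContext

ctx = EGLGLContext(max_width=640, max_height=480, device_id=0)  # width/height are unused here
ctx.make_current()   # bind the context to the calling thread
# ... offscreen MuJoCo rendering would happen here ...
ctx.free()           # release EGL resources explicitly (also triggered by __del__)
```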
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/context/glfw_context.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/context/glfw_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..087fe0d496833c489bc4debb652a08228cc7b0bf
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/context/glfw_context.py
@@ -0,0 +1,24 @@
+# Copyright 2017 The dm_control Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An OpenGL context created via GLFW."""
+
+from mujoco.glfw import GLContext
+
+
+class GLFWGLContext(GLContext):
+ """An OpenGL context created via GLFW."""
+
+ def __init__(self, max_width, max_height, device_id=0):
+ super().__init__(max_width, max_height)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/context/osmesa_context.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/context/osmesa_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2c8918cc270a7853cb5af7df15ac8ea00019031
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/context/osmesa_context.py
@@ -0,0 +1,26 @@
+# Copyright 2018 The dm_control Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An OSMesa context for software-based OpenGL rendering."""
+
+import os
+
+from mujoco.osmesa import GLContext
+
+
+class OSMesaGLContext(GLContext):
+ """An OSMesa context for software-based OpenGL rendering."""
+
+ def __init__(self, max_width, max_height, device_id=-1):
+ super().__init__(max_width, max_height)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/nvisii_renderer.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/nvisii_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..787eb7b8ef9a08bf5df09fa02c1880c38c4b3c96
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/nvisii_renderer.py
@@ -0,0 +1,575 @@
+import colorsys
+import os
+
+import cv2
+import matplotlib.cm as cm
+import numpy as np
+import nvisii
+import open3d as o3d
+
+import robosuite as suite
+import robosuite.renderers.nvisii.nvisii_utils as utils
+from robosuite.renderers.base import Renderer
+from robosuite.renderers.nvisii.parser import Parser
+from robosuite.utils import transform_utils as T
+from robosuite.utils.mjcf_utils import xml_path_completion
+from robosuite.utils.transform_utils import mat2quat
+from robosuite.wrappers import Wrapper
+
+np.set_printoptions(threshold=np.inf)
+
+
+class NVISIIRenderer(Renderer):
+ def __init__(
+ self,
+ env,
+ img_path="images/",
+ width=500,
+ height=500,
+ spp=256,
+ use_noise=False,
+ debug_mode=False,
+ video_mode=False,
+ video_path="videos/",
+ video_name="robosuite_video_0.mp4",
+ video_fps=60,
+ verbose=1,
+ vision_modalities=None,
+ ):
+ """
+ Initializes the NVISII wrapper. Wrapping any MuJoCo environment in this
+ wrapper will use NVISII for rendering.
+
+ Args:
+ env (MujocoEnv instance): The environment to wrap.
+
+ img_path (string): Path to images.
+
+ width (int, optional): Width of the rendered image. Defaults to 500.
+
+ height (int, optional): Height of the rendered image. Defaults to 500.
+
+ spp (int, optional): Sample-per-pixel for each image. Larger spp will result
+ in higher quality images but will take more time to render
+ each image. Higher quality images typically use an spp of
+ around 512.
+
+ use_noise (bool, optional): Use noise or denoise. Defaults to False.
+
+ debug_mode (bool, optional): Use debug mode for nvisii. Defaults to False.
+
+ video_mode (bool, optional): By default, the NVISII wrapper saves the results as
+ images. If video_mode is set to True, a video is
+ produced and will be stored in the directory defined
+ by video_path. Defaults to False.
+
+ video_path (string, optional): Path to store the video. Required if video_mode is
+ set to true. Defaults to 'videos/'.
+
+ video_name (string, optional): Name for the file for the video. Defaults to
+ 'robosuite_video_0.mp4'.
+
+ video_fps (int, optional): Frames per second for video. Defaults to 60.
+
+ verbose (int, optional): If verbose is set to 1, the wrapper will print the image
+ number for each image rendered. If verbose is set to 0,
+ nothing will be printed. Defaults to 1.
+
+ vision_modalities (string, optional): Options to render image with different ground truths
+ for NVISII. Options include "normal", "texture_coordinates",
+ "position", "depth".
+ """
+
+ super().__init__(env, renderer_type="nvisii")
+
+ self.env = env
+ self.img_path = img_path
+ self.width = width
+ self.height = height
+ self.spp = spp
+ self.use_noise = use_noise
+
+ self.video_mode = video_mode
+ self.video_path = video_path
+ self.video_name = video_name
+ self.video_fps = video_fps
+
+ self.verbose = verbose
+ self.vision_modalities = vision_modalities
+
+ self.img_cntr = 0
+
+ env._setup_references()
+
+ # enable interactive mode when debugging
+ if debug_mode:
+ nvisii.initialize_interactive()
+ else:
+ nvisii.initialize(headless=True)
+
+ self.segmentation_type = self.env.camera_segmentations
+
+ # add denoiser to nvisii if not using noise
+ if not use_noise:
+ nvisii.configure_denoiser()
+ nvisii.enable_denoiser()
+ nvisii.configure_denoiser(True, True, False)
+
+ if not os.path.exists(img_path):
+ os.makedirs(img_path)
+
+ if video_mode:
+ if not os.path.exists(video_path):
+ os.makedirs(video_path)
+ self.video = cv2.VideoWriter(
+ video_path + video_name, cv2.VideoWriter_fourcc(*"MP4V"), video_fps, (self.width, self.height)
+ )
+ print("video mode enabled")
+
+ if vision_modalities is None and self.segmentation_type[0] == None:
+ nvisii.sample_pixel_area(x_sample_interval=(0.0, 1.0), y_sample_interval=(0.0, 1.0))
+ else:
+ nvisii.sample_pixel_area(x_sample_interval=(0.5, 0.5), y_sample_interval=(0.5, 0.5))
+
+ self._init_nvisii_components()
+
+ def _init_nvisii_components(self):
+ self._init_lighting()
+ self._init_floor(image="plywood-4k.jpg")
+ self._init_walls(image="plaster-wall-4k.jpg")
+ self._init_camera()
+
+ self._load()
+
+ def _init_lighting(self):
+ # Initializes the lighting
+ self.light_1 = nvisii.entity.create(
+ name="light",
+ mesh=nvisii.mesh.create_sphere("light"),
+ transform=nvisii.transform.create("light"),
+ )
+
+ self.light_1.set_light(nvisii.light.create("light"))
+
+ self.light_1.get_light().set_intensity(150) # intensity of the light
+ self.light_1.get_transform().set_scale(nvisii.vec3(0.3)) # scale the light down
+ self.light_1.get_transform().set_position(nvisii.vec3(3, 3, 4)) # sets the position of the light
+
+ def _init_floor(self, image):
+ """
+ Initializes the floor
+
+ Args:
+ image (string): String for the file to use as an image for the floor
+
+ """
+ floor_mesh = nvisii.mesh.create_plane(name="plane", size=nvisii.vec2(3, 3))
+
+ floor_entity = nvisii.entity.create(
+ name="floor",
+ mesh=floor_mesh,
+ material=nvisii.material.create("plane"),
+ transform=nvisii.transform.create("plane"),
+ )
+ floor_entity.get_transform().set_scale(nvisii.vec3(1))
+ floor_entity.get_transform().set_position(nvisii.vec3(0, 0, 0))
+
+ texture_image = xml_path_completion("textures/" + image)
+ texture = nvisii.texture.create_from_file(name="floor_texture", path=texture_image)
+
+ floor_entity.get_material().set_base_color_texture(texture)
+ floor_entity.get_material().set_roughness(0.4)
+ floor_entity.get_material().set_specular(0)
+
+ def _init_walls(self, image):
+ """
+ Initializes the walls
+
+ Args:
+ image (string): String for the file to use as an image for the walls
+ """
+ texture_image = xml_path_completion("textures/" + image)
+ texture = nvisii.texture.create_from_file(name="wall_texture", path=texture_image)
+
+ for wall in self.env.model.mujoco_arena.worldbody.findall("./geom[@material='walls_mat']"):
+
+ name = wall.get("name")
+ size = [float(x) for x in wall.get("size").split(" ")]
+
+ pos, quat = self._get_orientation_geom(name)
+
+ wall_entity = nvisii.entity.create(
+ name=name,
+ mesh=nvisii.mesh.create_box(name=name, size=nvisii.vec3(size[0], size[1], size[2])),
+ transform=nvisii.transform.create(name),
+ material=nvisii.material.create(name),
+ )
+
+ wall_entity.get_transform().set_position(nvisii.vec3(pos[0], pos[1], pos[2]))
+
+ wall_entity.get_transform().set_rotation(nvisii.quat(quat[0], quat[1], quat[2], quat[3]))
+
+ wall_entity.get_material().set_base_color_texture(texture)
+
+ def _init_camera(self):
+ """
+ Initializes the camera for the NVISII renderer
+ """
+
+ # initializes the camera
+ self.camera = nvisii.entity.create(
+ name="camera",
+ transform=nvisii.transform.create("camera_transform"),
+ )
+
+ self.camera.set_camera(
+ nvisii.camera.create_from_fov(
+ name="camera_camera", field_of_view=1, aspect=float(self.width) / float(self.height)
+ )
+ )
+
+ # Sets the primary camera of the renderer to the camera entity
+ nvisii.set_camera_entity(self.camera)
+ self._camera_configuration(
+ at_vec=nvisii.vec3(0, 0, 1.06),
+ up_vec=nvisii.vec3(0, 0, 1),
+ eye_vec=nvisii.vec3(1.24, 0.0, 1.35),
+ quat=nvisii.quat(-1, 0, 0, 0),
+ )
+
+ # Environment configuration
+ self._dome_light_intensity = 1
+ nvisii.set_dome_light_intensity(self._dome_light_intensity)
+ nvisii.set_max_bounce_depth(4)
+
+ def _camera_configuration(self, at_vec, up_vec, eye_vec, quat):
+ """
+ Sets the configuration for the NVISII camera. The configuration
+ depends on where the camera is located and what it looks at.
+ """
+ # configures the camera
+ self.camera.get_transform().look_at(
+ at=at_vec, up=up_vec, eye=eye_vec, previous=False # look at (world coordinate) # up vector
+ )
+
+ self.camera.get_transform().rotate_around(eye_vec, quat)
+
+ def set_camera_pos_quat(self, pos, quat):
+ self.camera.get_transform().set_position(pos)
+ self.camera.get_transform().look_at(
+ at=(0, 0, 1.06), up=(0, 0, 1), eye=pos, previous=False # look at (world coordinate) # up vector
+ )
+ # self.camera.get_transform().rotate_around(pos, quat)
+
+ def _get_orientation_geom(self, name):
+ """
+ Gets the position and quaternion for a geom
+ """
+
+ pos = self.env.sim.data.geom_xpos[self.env.sim.model.geom_name2id(name)]
+ R = self.env.sim.data.geom_xmat[self.env.sim.model.geom_name2id(name)].reshape(3, 3)
+
+ quat_xyzw = mat2quat(R)
+ quat = np.array([quat_xyzw[3], quat_xyzw[0], quat_xyzw[1], quat_xyzw[2]])
+
+ return pos, quat
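+ # Note: mat2quat returns a quaternion in xyzw order, while nvisii expects
+ # wxyz, hence the re-ordering above.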
+
+ def _load(self):
+ """
+ Loads the necessary textures, materials, and geoms into the
+ NVISII renderer
+ """
+ self.parser = Parser("nvisii", self.env, self.segmentation_type)
+ self.parser.parse_textures()
+ self.parser.parse_materials()
+ self.parser.parse_geometries()
+ self.components = self.parser.components
+ self.max_elements = self.parser.max_elements
+ self.max_instances = self.parser.max_instances
+ self.max_classes = self.parser.max_classes
+
+ def update(self):
+ """
+ Updates the pose of every tracked component so the rendered scene
+ matches the current simulation state.
+ """
+ for key, value in self.components.items():
+ self._update_orientation(name=key, component=value)
+
+ def _update_orientation(self, name, component):
+ """
+ Update position for an object or a robot in renderer.
+
+ Args:
+ name (string): name of component
+ component (nvisii entity or scene): Object in renderer and other info
+ for object.
+ """
+
+ obj = component.obj
+ parent_body_name = component.parent_body_name
+ geom_pos = component.geom_pos
+ geom_quat = component.geom_quat
+ dynamic = component.dynamic
+
+ if not dynamic:
+ return
+
+ self.body_tags = ["robot", "pedestal", "gripper", "peg"]
+
+ if parent_body_name != "worldbody":
+ if self.tag_in_name(name):
+ pos = self.env.sim.data.get_body_xpos(parent_body_name)
+ else:
+ pos = self.env.sim.data.get_geom_xpos(name)
+
+ B = self.env.sim.data.body_xmat[self.env.sim.model.body_name2id(parent_body_name)].reshape((3, 3))
+ quat_xyzw_body = mat2quat(B)
+ quat_wxyz_body = np.array(
+ [quat_xyzw_body[3], quat_xyzw_body[0], quat_xyzw_body[1], quat_xyzw_body[2]]
+ ) # wxyz
+ nvisii_quat = nvisii.quat(*quat_wxyz_body) * nvisii.quat(*geom_quat)
+
+ if self.tag_in_name(name):
+ # Add a position offset if one is defined in the geom tag
+ homo_mat = T.pose2mat((np.zeros((1, 3), dtype=np.float32), quat_xyzw_body))
+ pos_offset = homo_mat @ np.array([geom_pos[0], geom_pos[1], geom_pos[2], 1.0]).transpose()
+ pos = pos + pos_offset[:3]
+
+ else:
+ pos = [0, 0, 0]
+ nvisii_quat = nvisii.quat(1, 0, 0, 0) # wxyz
+
+ if isinstance(obj, nvisii.scene):
+
+ # temp fix -- look into XML file for correct quat
+ if "s_visual" in name:
+ # single robot
+ if len(self.env.robots) == 1:
+ nvisii_quat = nvisii.quat(0, 0.5, 0, 0)
+ # two robots - 0
+ elif len(self.env.robots) == 2 and "robot_0" in name:
+ nvisii_quat = nvisii.quat(-0, 0.5, 0.5, 0)
+ # two robots - 1
+ else:
+ nvisii_quat = nvisii.quat(-0, 0.5, -0.5, 0)
+
+ obj.transforms[0].set_position(nvisii.vec3(pos[0], pos[1], pos[2]))
+ obj.transforms[0].set_rotation(nvisii_quat)
+ else:
+ obj.get_transform().set_position(nvisii.vec3(pos[0], pos[1], pos[2]))
+ obj.get_transform().set_rotation(nvisii_quat)
+
+ def tag_in_name(self, name):
+ """
+ Checks whether one of the body tags appears in the name
+
+ Args:
+ name (string): Name of component
+ """
+ for tag in self.body_tags:
+ if tag in name:
+ return True
+ return False
+
+ def render(self, render_type="png"):
+ """
+ Renders an image of the NVISII renderer
+
+ Args:
+ render_type (string, optional): Type of file to save as. Defaults to 'png'
+ """
+
+ self.img_cntr += 1
+ verbose_word = "frame" if self.video_mode else "image"
+
+ if self.video_mode:
+ img_file = f"{self.img_path}/image_0.{render_type}"
+ if self.segmentation_type[0] != None:
+ self.render_segmentation_data(img_file)
+ elif self.vision_modalities is None:
+ self.render_to_file(img_file)
+ else:
+ self.render_data_to_file(img_file)
+
+ self.video.write(cv2.imread(img_file))
+ else:
+ img_file = f"{self.img_path}/image_{self.img_cntr}.{render_type}"
+ if self.segmentation_type[0] != None:
+ self.render_segmentation_data(img_file)
+ elif self.vision_modalities is None:
+ self.render_to_file(img_file)
+ else:
+ self.render_data_to_file(img_file)
+
+ if self.verbose == 1:
+ print(f"Rendering {verbose_word}... {self.img_cntr}")
+
+ def render_to_file(self, img_file):
+ nvisii.render_to_file(width=self.width, height=self.height, samples_per_pixel=self.spp, file_path=img_file)
+
+ def render_segmentation_data(self, img_file):
+
+ segmentation_array = nvisii.render_data(
+ width=int(self.width),
+ height=int(self.height),
+ start_frame=0,
+ frame_count=1,
+ bounce=int(0),
+ options="entity_id",
+ seed=1,
+ )
+ segmentation_array = np.array(segmentation_array).reshape(self.height, self.width, 4)[:, :, 0]
+ segmentation_array[segmentation_array > 3.4028234663852886e37] = 0
+ segmentation_array[segmentation_array < 3.4028234663852886e-37] = 0
+ segmentation_array = np.flipud(segmentation_array)
+
+ rgb_data = self.segmentation_to_rgb(segmentation_array.astype(dtype=np.uint8))
+
+ from PIL import Image
+
+ rgb_img = Image.fromarray(rgb_data)
+ rgb_img.save(img_file)
+
+ def render_data_to_file(self, img_file):
+
+ if self.vision_modalities == "depth" and self.img_cntr != 1:
+
+ depth_data = nvisii.render_data(
+ width=self.width,
+ height=self.height,
+ start_frame=0,
+ frame_count=1,
+ bounce=int(0),
+ options=self.vision_modalities,
+ )
+
+ depth_data = np.array(depth_data).reshape(self.height, self.width, 4)
+ depth_data = np.flipud(depth_data)[:, :, [0, 1, 2]]
+
+ # normalize depths
+ depth_data[:, :, 0] = (depth_data[:, :, 0] - np.min(depth_data[:, :, 0])) / (
+ np.max(depth_data[:, :, 0]) - np.min(depth_data[:, :, 0])
+ )
+ depth_data[:, :, 1] = (depth_data[:, :, 1] - np.min(depth_data[:, :, 1])) / (
+ np.max(depth_data[:, :, 1]) - np.min(depth_data[:, :, 1])
+ )
+ depth_data[:, :, 2] = (depth_data[:, :, 2] - np.min(depth_data[:, :, 2])) / (
+ np.max(depth_data[:, :, 2]) - np.min(depth_data[:, :, 2])
+ )
+
+ from PIL import Image
+
+ depth_image = Image.fromarray(((1 - depth_data) * 255).astype(np.uint8))
+ depth_image.save(img_file)
+
+ elif self.vision_modalities == "normal" and self.img_cntr != 1:
+
+ normal_data = nvisii.render_data(
+ width=self.width,
+ height=self.height,
+ start_frame=0,
+ frame_count=1,
+ bounce=int(0),
+ options="screen_space_normal",
+ )
+
+ normal_data = np.array(normal_data).reshape(self.height, self.width, 4)
+ normal_data = np.flipud(normal_data)[:, :, [0, 1, 2]]
+
+ normal_data[:, :, 0] = (normal_data[:, :, 0] + 1) / 2 * 255 # R
+ normal_data[:, :, 1] = (normal_data[:, :, 1] + 1) / 2 * 255 # G
+ normal_data[:, :, 2] = 255 - ((normal_data[:, :, 2] + 1) / 2 * 255) # B
+
+ from PIL import Image
+
+ normal_image = Image.fromarray((normal_data).astype(np.uint8))
+ normal_image.save(img_file)
+
+ else:
+
+ nvisii.render_data_to_file(
+ width=self.width,
+ height=self.height,
+ start_frame=0,
+ frame_count=1,
+ bounce=int(0),
+ options=self.vision_modalities,
+ file_path=img_file,
+ )
+
+ def randomize_colors(self, N, bright=True):
+ """
+ Modified from https://github.com/matterport/Mask_RCNN/blob/master/mrcnn/visualize.py#L59
+ Generate random colors.
+ To get visually distinct colors, generate them in HSV space then
+ convert to RGB.
+ """
+ brightness = 1.0 if bright else 0.5
+ hsv = [(1.0 * i / N, 1, brightness) for i in range(N)]
+ colors = np.array(list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv)))
+ rstate = np.random.RandomState(seed=20)
+ rstate.shuffle(colors)  # use the seeded RNG so the color order is deterministic
+ return colors
+
+ def segmentation_to_rgb(self, seg_im, random_colors=False):
+ """
+ Helper function to visualize segmentations as RGB frames.
+ NOTE: assumes that geom IDs go up to 255 at most - if not,
+ multiple geoms might be assigned to the same color.
+ """
+ # ensure all values lie within [0, 255]
+ seg_im = np.mod(seg_im, 256)
+
+ if random_colors:
+ colors = self.randomize_colors(N=256, bright=True)
+ return (255.0 * colors[seg_im]).astype(np.uint8)
+ else:
+
+ cmap = cm.get_cmap("jet")
+
+ max_r = 0
+ if self.segmentation_type[0][0] == "element":
+ max_r = np.amax(seg_im) + 1
+ elif self.segmentation_type[0][0] == "class":
+ max_r = self.max_classes
+ for i in range(len(seg_im)):
+ for j in range(len(seg_im[0])):
+ if seg_im[i][j] in self.parser.entity_id_class_mapping:
+ seg_im[i][j] = self.parser.entity_id_class_mapping[seg_im[i][j]]
+ else:
+ seg_im[i][j] = max_r - 1
+ elif self.segmentation_type[0][0] == "instance":
+ max_r = self.max_instances
+ for i in range(len(seg_im)):
+ for j in range(len(seg_im[0])):
+ if seg_im[i][j] in self.parser.entity_id_class_mapping:
+ seg_im[i][j] = self.parser.entity_id_class_mapping[seg_im[i][j]]
+ else:
+ seg_im[i][j] = max_r - 1
+
+ color_list = np.array([cmap(i / (max_r)) for i in range(max_r)])
+
+ return (color_list[seg_im] * 255).astype(np.uint8)
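+ # For example (illustrative), with 4 classes (max_r == 4) and the "jet"
+ # colormap, class ids 0..3 map to cmap(0/4), cmap(1/4), cmap(2/4), cmap(3/4)
+ # respectively.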
+
+ def reset(self):
+ nvisii.clear_all()
+ self._init_nvisii_components()
+ self.update()
+
+ def get_pixel_obs(self):
+ frame_buffer = nvisii.render(width=self.width, height=self.height, samples_per_pixel=self.spp)
+
+ frame_buffer = np.array(frame_buffer).reshape(self.height, self.width, 4)
+ frame_buffer = np.flipud(frame_buffer)
+
+ return frame_buffer
+
+ def close(self):
+ """
+ Deinitializes the nvisii rendering environment
+ """
+ nvisii.deinitialize()
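+
+# Illustrative usage sketch; the task name, robot, and loop below are
+# assumptions for demonstration, not prescribed by this module:
+#   env = suite.make("Lift", robots="Panda", has_renderer=False)
+#   renderer = NVISIIRenderer(env, width=500, height=500, spp=256)
+#   low, high = env.action_spec
+#   for _ in range(10):
+#       env.step(low)              # any valid action vector
+#       renderer.update()          # sync poses with the simulation
+#       renderer.render()          # writes an image under img_path
+#   renderer.close()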
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/nvisii_utils.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/nvisii_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbc62a727a962ff586820a8007eb719134b33560
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/nvisii_utils.py
@@ -0,0 +1,123 @@
+import math
+import os
+
+import numpy as np
+import nvisii
+
+
+def load_object(
+ geom,
+ geom_name,
+ geom_type,
+ geom_quat,
+ geom_pos,
+ geom_size,
+ geom_scale,
+ geom_rgba,
+ geom_tex_name,
+ geom_tex_file,
+ class_id,
+ meshes,
+):
+ """
+ Function that initializes the meshes in the memory.
+
+ Args:
+ geom (XML element): Object in XML file to load
+
+ geom_name (str): Name for the object.
+
+ geom_type (str): Type of the object. Types include "box", "cylinder", or "mesh".
+
+ geom_quat (array): Quaternion (wxyz) of the object.
+
+ geom_pos (array): Position of the object.
+
+ geom_size (array): Size of the object.
+
+ geom_scale (array): Scale of the object.
+
+ geom_rgba (array): Color of the object. This is only used if the geom type is not
+ a mesh and there is no specified material.
+
+ geom_tex_name (str): Name of the texture for the object
+
+ geom_tex_file (str): File of the texture for the object
+
+ class_id (int) : Class id for the component
+
+ meshes (dict): Meshes for the object
+ """
+
+ primitive_types = ["box", "cylinder"]
+ component = None
+
+ if geom_type == "box":
+
+ component = nvisii.entity.create(
+ name=geom_name,
+ mesh=nvisii.mesh.create_box(name=geom_name, size=nvisii.vec3(geom_size[0], geom_size[1], geom_size[2])),
+ transform=nvisii.transform.create(geom_name),
+ material=nvisii.material.create(geom_name),
+ )
+
+ elif geom_type == "cylinder":
+
+ component = nvisii.entity.create(
+ name=geom_name,
+ mesh=nvisii.mesh.create_capped_cylinder(name=geom_name, radius=geom_size[0], size=geom_size[1]),
+ transform=nvisii.transform.create(geom_name),
+ material=nvisii.material.create(geom_name),
+ )
+
+ elif geom_type == "sphere":
+
+ component = nvisii.entity.create(
+ name=geom_name,
+ mesh=nvisii.mesh.create_sphere(name=geom_name, radius=geom_size[0]),
+ transform=nvisii.transform.create(geom_name),
+ material=nvisii.material.create(geom_name),
+ )
+
+ elif geom_type == "mesh":
+ filename = meshes[geom.attrib["mesh"]]["file"]
+ filename = os.path.splitext(filename)[0] + ".obj"
+
+ component = nvisii.import_scene(
+ file_path=filename,
+ position=nvisii.vec3(geom_pos[0], geom_pos[1], geom_pos[2]),
+ scale=(geom_scale[0], geom_scale[1], geom_scale[2]),
+ rotation=nvisii.quat(geom_quat[0], geom_quat[1], geom_quat[2], geom_quat[3]),
+ )
+
+ entity_ids = []
+ if isinstance(component, nvisii.scene):
+ for i in range(len(component.entities)):
+ entity_ids.append(component.entities[i].get_id())
+ else:
+ entity_ids.append(component.get_id())
+
+ if geom_type in primitive_types:
+ component.get_transform().set_position(nvisii.vec3(float(geom_pos[0]), float(geom_pos[1]), float(geom_pos[2])))
+
+ if geom_tex_file is not None and geom_tex_name is not None and geom_type != "mesh":
+
+ texture = nvisii.texture.get(geom_tex_name)
+
+ if texture is None:
+ texture = nvisii.texture.create_from_file(name=geom_tex_name, path=geom_tex_file)
+
+ component.get_material().set_base_color_texture(texture)
+ else:
+ if "gripper" in geom_name:
+ if geom_rgba is not None:
+ if isinstance(component, nvisii.scene):
+ for entity in component.entities:
+ entity.get_material().set_base_color(nvisii.vec3(geom_rgba[0], geom_rgba[1], geom_rgba[2]))
+ else:
+ component.get_material().set_base_color(nvisii.vec3(geom_rgba[0], geom_rgba[1], geom_rgba[2]))
+ elif "hand_visual" in geom_name:
+ for entity in component.entities:
+ entity.get_material().set_base_color(nvisii.vec3(0.05, 0.05, 0.05))
+
+ return component, entity_ids
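+
+# Hypothetical call for a primitive box geom (all values below are
+# illustrative placeholders):
+#   component, entity_ids = load_object(
+#       geom=geom_element, geom_name="cube_g0", geom_type="box",
+#       geom_quat=[1, 0, 0, 0], geom_pos=[0, 0, 0.8],
+#       geom_size=[0.02, 0.02, 0.02], geom_scale=[1, 1, 1],
+#       geom_rgba=[1, 0, 0, 1], geom_tex_name=None, geom_tex_file=None,
+#       class_id=3, meshes={},
+#   )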
diff --git a/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/parser.py b/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..b184cef1d29c7ab67408f5989b1a1cae67c874ab
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/renderers/nvisii/parser.py
@@ -0,0 +1,214 @@
+import xml.etree.ElementTree as ET
+from collections import namedtuple
+
+import numpy as np
+import nvisii
+
+from robosuite.renderers.base_parser import BaseParser
+from robosuite.renderers.nvisii.nvisii_utils import load_object
+from robosuite.utils.mjcf_utils import string_to_array
+
+Components = namedtuple(
+ "Components", ["obj", "geom_index", "element_id", "parent_body_name", "geom_pos", "geom_quat", "dynamic"]
+)
+
+
+class Parser(BaseParser):
+ def __init__(self, renderer, env, segmentation_type):
+ """
+ Parse the mujoco xml and initialize NVISII renderer objects.
+ Args:
+ env (Mujoco env): Environment to parse
+ """
+
+ super().__init__(renderer, env)
+ self.segmentation_type = segmentation_type
+ self.create_class_mapping()
+ self.components = {}
+
+ def parse_textures(self):
+ """
+ Parse and load all textures and store them
+ """
+
+ self.texture_attributes = {}
+ self.texture_id_mapping = {}
+
+ for texture in self.xml_root.iter("texture"):
+ texture_type = texture.get("type")
+ texture_name = texture.get("name")
+ texture_file = texture.get("file")
+ texture_rgb = texture.get("rgb1")
+
+ if texture_file is not None:
+ self.texture_attributes[texture_name] = texture.attrib
+ else:
+ color = np.array(string_to_array(texture_rgb))
+ self.texture_id_mapping[texture_name] = (color, texture_type)
+
+ def parse_materials(self):
+ """
+ Parse all materials and use texture mapping to initialize materials
+ """
+
+ self.material_texture_mapping = {}
+ for material in self.xml_root.iter("material"):
+ material_name = material.get("name")
+ texture_name = material.get("texture")
+ self.material_texture_mapping[material_name] = texture_name
+
+ def parse_meshes(self):
+ """
+ Create mapping of meshes.
+ """
+ self.meshes = {}
+ for mesh in self.xml_root.iter("mesh"):
+ self.meshes[mesh.get("name")] = mesh.attrib
+
+ def parse_geometries(self):
+ """
+ Iterate through each geometry and load it in the NVISII renderer.
+ """
+ self.parse_meshes()
+ element_id = 0
+ repeated_names = {}
+ block_rendering_objects = ["VisualBread_g0", "VisualCan_g0", "VisualCereal_g0", "VisualMilk_g0"]
+
+ self.entity_id_class_mapping = {}
+
+ for geom_index, geom in enumerate(self.xml_root.iter("geom")):
+
+ parent_body = self.parent_map.get(geom)
+ parent_body_name = parent_body.get("name", "worldbody")
+
+ geom_name = geom.get("name")
+ geom_type = geom.get("type", "sphere")
+
+ rgba_str = geom.get("rgba")
+ geom_rgba = string_to_array(rgba_str) if rgba_str is not None else None
+
+ if geom_name is None:
+ if parent_body_name in repeated_names:
+ geom_name = parent_body_name + str(repeated_names[parent_body_name])
+ repeated_names[parent_body_name] += 1
+ else:
+ geom_name = parent_body_name + "0"
+ repeated_names[parent_body_name] = 1
+
+ if (geom.get("group") != "1" and geom_type != "plane") or ("collision" in geom_name):
+ continue
+
+ if "floor" in geom_name or "wall" in geom_name or geom_name in block_rendering_objects:
+ continue
+
+ geom_quat = string_to_array(geom.get("quat", "1 0 0 0"))
+ geom_quat = [geom_quat[0], geom_quat[1], geom_quat[2], geom_quat[3]]
+
+ # handling special case of bins arena
+ if "bin" in parent_body_name:
+ geom_pos = string_to_array(geom.get("pos", "0 0 0")) + string_to_array(parent_body.get("pos", "0 0 0"))
+ else:
+ geom_pos = string_to_array(geom.get("pos", "0 0 0"))
+
+ if geom_type == "mesh":
+ geom_scale = string_to_array(self.meshes[geom.get("mesh")].get("scale", "1 1 1"))
+ else:
+ geom_scale = [1, 1, 1]
+ geom_size = string_to_array(geom.get("size", "1 1 1"))
+
+ geom_mat = geom.get("material")
+
+ tags = ["bin"]
+ dynamic = True
+ if self.tag_in_name(geom_name, tags):
+ dynamic = False
+
+ geom_tex_name = None
+ geom_tex_file = None
+
+ if geom_mat is not None:
+ geom_tex_name = self.material_texture_mapping[geom_mat]
+
+ if geom_tex_name in self.texture_attributes:
+ geom_tex_file = self.texture_attributes[geom_tex_name]["file"]
+
+ class_id = self.get_class_id(geom_index, element_id)
+
+ # load obj into nvisii
+ obj, entity_ids = load_object(
+ geom=geom,
+ geom_name=geom_name,
+ geom_type=geom_type,
+ geom_quat=geom_quat,
+ geom_pos=geom_pos,
+ geom_size=geom_size,
+ geom_scale=geom_scale,
+ geom_rgba=geom_rgba,
+ geom_tex_name=geom_tex_name,
+ geom_tex_file=geom_tex_file,
+ class_id=class_id, # change
+ meshes=self.meshes,
+ )
+
+ element_id += 1
+
+ for entity_id in entity_ids:
+ self.entity_id_class_mapping[entity_id] = class_id
+
+ self.components[geom_name] = Components(
+ obj=obj,
+ geom_index=geom_index,
+ element_id=element_id,
+ parent_body_name=parent_body_name,
+ geom_pos=geom_pos,
+ geom_quat=geom_quat,
+ dynamic=dynamic,
+ )
+
+ self.max_elements = element_id
+
+ def create_class_mapping(self):
+ """
+ Create class name to index mapping for both semantic and instance
+ segmentation.
+ """
+ self.class2index = {}
+ for i, c in enumerate(self.env.model._classes_to_ids.keys()):
+ self.class2index[c] = i
+ self.class2index[None] = i + 1
+ self.max_classes = len(self.class2index)
+
+ self.instance2index = {}
+ for i, instance_class in enumerate(self.env.model._instances_to_ids.keys()):
+ self.instance2index[instance_class] = i
+ self.instance2index[None] = i + 1
+ self.max_instances = len(self.instance2index)
+
+ def get_class_id(self, geom_index, element_id):
+ """
+ Given index of the geom object get the class id based on
+ self.segmentation type.
+ """
+
+ if self.segmentation_type[0] == None or self.segmentation_type[0][0] == "element":
+ class_id = element_id
+ elif self.segmentation_type[0][0] == "class":
+ class_id = self.class2index[self.env.model._geom_ids_to_classes.get(geom_index)]
+ elif self.segmentation_type[0][0] == "instance":
+ class_id = self.instance2index[self.env.model._geom_ids_to_instances.get(geom_index)]
+
+ return class_id
+
+ def tag_in_name(self, name, tags):
+ """
+ Checks whether one of the given tags appears in the name
+
+ Args:
+ name (str): Name of geom element.
+
+ tags (array): List of keywords to check from.
+ """
+ for tag in tags:
+ if tag in name:
+ return True
+ return False
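+
+# Usage sketch mirroring NVISIIRenderer._load (env is assumed to be a standard
+# robosuite environment):
+#   parser = Parser("nvisii", env, segmentation_type=env.camera_segmentations)
+#   parser.parse_textures()
+#   parser.parse_materials()
+#   parser.parse_geometries()
+#   components = parser.components   # geom name -> Components namedtuple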
diff --git a/phantom/submodules/phantom-robosuite/robosuite/robots/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/robots/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c6296c86ab3b693a248fc06b29acc149edf10c7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/robots/__init__.py
@@ -0,0 +1,20 @@
+from .manipulator import Manipulator
+from .single_arm import SingleArm
+from .bimanual import Bimanual
+
+from robosuite.models.robots.robot_model import REGISTERED_ROBOTS
+
+ALL_ROBOTS = REGISTERED_ROBOTS.keys()
+
+# Robot class mappings -- must be maintained manually
+ROBOT_CLASS_MAPPING = {
+ "Baxter": Bimanual,
+ "IIWA": SingleArm,
+ "Jaco": SingleArm,
+ "Kinova3": SingleArm,
+ "Panda": SingleArm,
+ "Sawyer": SingleArm,
+ "UR5e": SingleArm,
+}
+
+BIMANUAL_ROBOTS = {k.lower() for k, v in ROBOT_CLASS_MAPPING.items() if v == Bimanual}
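+
+# Example lookups (illustrative): ROBOT_CLASS_MAPPING["Baxter"] is Bimanual,
+# ROBOT_CLASS_MAPPING["Panda"] is SingleArm, and BIMANUAL_ROBOTS == {"baxter"}.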
diff --git a/phantom/submodules/phantom-robosuite/robosuite/robots/bimanual.py b/phantom/submodules/phantom-robosuite/robosuite/robots/bimanual.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad8cbdce4032878e1fff1f957e4730df56f7c6df
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/robots/bimanual.py
@@ -0,0 +1,623 @@
+import copy
+import os
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.controllers import controller_factory, load_controller_config
+from robosuite.models.grippers import gripper_factory
+from robosuite.robots.manipulator import Manipulator
+from robosuite.utils.buffers import DeltaBuffer, RingBuffer
+from robosuite.utils.observables import Observable, sensor
+
+
+class Bimanual(Manipulator):
+ """
+ Initializes a bimanual robot simulation object.
+
+ Args:
+ robot_type (str): Specification for specific robot arm to be instantiated within this env (e.g: "Panda")
+
+ idn (int or str): Unique ID of this robot. Should be different from others
+
+ controller_config (dict or list of dict --> dict of dict): If set, contains relevant controller parameters
+ for creating custom controllers. Else, uses the default controller for this specific task. Should either
+ be single dict if same controller is to be used for both robot arms or else it should be a list of length 2.
+
+ :NOTE: In the latter case, assumes convention of [right, left]
+
+ initial_qpos (sequence of float): If set, determines the initial joint positions of the robot to be
+ instantiated for the task
+
+ initialization_noise (dict): Dict containing the initialization noise parameters. The expected keys and
+ corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to "None" or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ :Note: Specifying None will automatically create the required dict with "magnitude" set to 0.0
+
+ mount_type (str): type of mount, used to instantiate mount models from mount factory.
+ Default is "default", which is the default mount associated with this robot's corresponding model.
+ None results in no mount, and any other (valid) model overrides the default mount.
+
+ gripper_type (str or list of str --> dict): type of gripper, used to instantiate
+ gripper models from gripper factory. Default is "default", which is the default gripper associated
+ within the 'robot' specification. None removes the gripper, and any other (valid) model overrides the
+ default gripper. Should either be single str if same gripper type is to be used for both arms or else
+ it should be a list of length 2
+
+ :NOTE: In the latter case, assumes convention of [right, left]
+
+ control_freq (float): how many control signals to receive
+ in every second. This sets the amount of simulation time
+ that passes between every action input.
+ """
+
+ def __init__(
+ self,
+ robot_type: str,
+ idn=0,
+ controller_config=None,
+ initial_qpos=None,
+ initialization_noise=None,
+ mount_type="default",
+ gripper_type="default",
+ control_freq=20,
+ ):
+
+ self.controller = self._input2dict(None)
+ self.controller_config = self._input2dict(copy.deepcopy(controller_config))
+ self.gripper = self._input2dict(None)
+ self.gripper_type = self._input2dict(gripper_type)
+ self.has_gripper = self._input2dict([gripper_type is not None for _, gripper_type in self.gripper_type.items()])
+
+ self.gripper_joints = self._input2dict(None) # xml joint names for gripper
+ self._ref_gripper_joint_pos_indexes = self._input2dict(None) # xml gripper joint position indexes in mjsim
+ self._ref_gripper_joint_vel_indexes = self._input2dict(None) # xml gripper joint velocity indexes in mjsim
+ self._ref_joint_gripper_actuator_indexes = self._input2dict(
+ None
+ ) # xml gripper (pos) actuator indexes for robot in mjsim
+ self.eef_rot_offset = self._input2dict(None) # rotation offsets from final arm link to gripper (quat)
+ self.eef_site_id = self._input2dict(None) # xml element id for eef in mjsim
+ self.eef_cylinder_id = self._input2dict(None) # xml element id for eef cylinder in mjsim
+ self.torques = None # Current torques being applied
+
+ self.recent_ee_forcetorques = self._input2dict(None) # Current and last forces / torques sensed at eef
+ self.recent_ee_pose = self._input2dict(None) # Current and last eef pose (pos + ori (quat))
+ self.recent_ee_vel = self._input2dict(None) # Current and last eef velocity
+ self.recent_ee_vel_buffer = self._input2dict(None) # RingBuffer holding prior 10 values of velocity values
+ self.recent_ee_acc = self._input2dict(None) # Current and last eef acceleration
+
+ super().__init__(
+ robot_type=robot_type,
+ idn=idn,
+ initial_qpos=initial_qpos,
+ initialization_noise=initialization_noise,
+ mount_type=mount_type,
+ control_freq=control_freq,
+ )
+
+ def _load_controller(self):
+ """
+ Loads controller to be used for dynamic trajectories
+ """
+ # Flag for loading urdf once (only applicable for IK controllers)
+ urdf_loaded = False
+
+ # Load controller configs for both left and right arm
+ for arm in self.arms:
+ # First, load the default controller if none is specified
+ if not self.controller_config[arm]:
+ # Need to update default for a single agent
+ controller_path = os.path.join(
+ os.path.dirname(__file__),
+ "..",
+ "controllers/config/{}.json".format(self.robot_model.default_controller_config[arm]),
+ )
+ self.controller_config[arm] = load_controller_config(custom_fpath=controller_path)
+
+ # Assert that the controller config is a dict file:
+ # NOTE: "type" must be one of: {JOINT_POSITION, JOINT_TORQUE, JOINT_VELOCITY,
+ # OSC_POSITION, OSC_POSE, IK_POSE}
+ assert (
+ type(self.controller_config[arm]) == dict
+ ), "Inputted controller config must be a dict! Instead, got type: {}".format(
+ type(self.controller_config[arm])
+ )
+
+ # Add to the controller dict additional relevant params:
+ # the robot name, mujoco sim, eef_name, actuator_range, joint_indexes, timestep (model) freq,
+ # policy (control) freq, and ndim (# joints)
+ self.controller_config[arm]["robot_name"] = self.name
+ self.controller_config[arm]["sim"] = self.sim
+ self.controller_config[arm]["eef_name"] = self.gripper[arm].important_sites["grip_site"]
+ self.controller_config[arm]["eef_rot_offset"] = self.eef_rot_offset[arm]
+ self.controller_config[arm]["ndim"] = self._joint_split_idx
+ self.controller_config[arm]["policy_freq"] = self.control_freq
+ (start, end) = (None, self._joint_split_idx) if arm == "right" else (self._joint_split_idx, None)
+ self.controller_config[arm]["joint_indexes"] = {
+ "joints": self.joint_indexes[start:end],
+ "qpos": self._ref_joint_pos_indexes[start:end],
+ "qvel": self._ref_joint_vel_indexes[start:end],
+ }
+ self.controller_config[arm]["actuator_range"] = (
+ self.torque_limits[0][start:end],
+ self.torque_limits[1][start:end],
+ )
+
+ # Only load urdf the first time this controller gets called
+ self.controller_config[arm]["load_urdf"] = True if not urdf_loaded else False
+ urdf_loaded = True
+
+ # Instantiate the relevant controller
+ self.controller[arm] = controller_factory(self.controller_config[arm]["type"], self.controller_config[arm])
+
+ def load_model(self):
+ """
+ Loads robot and optionally add grippers.
+ """
+ # First, run the superclass method to load the relevant model
+ super().load_model()
+
+ # Verify that the loaded model is of the correct type for this robot
+ if self.robot_model.arm_type != "bimanual":
+ raise TypeError(
+ "Error loading robot model: Incompatible arm type specified for this robot. "
+ "Requested model arm type: {}, robot arm type: {}".format(self.robot_model.arm_type, type(self))
+ )
+
+ # Now, load the gripper if necessary
+ for arm in self.arms:
+ if self.has_gripper[arm]:
+ if self.gripper_type[arm] == "default":
+ # Load the default gripper from the robot file
+ self.gripper[arm] = gripper_factory(
+ self.robot_model.default_gripper[arm], idn="_".join((str(self.idn), arm))
+ )
+ else:
+ # Load user-specified gripper
+ self.gripper[arm] = gripper_factory(self.gripper_type[arm], idn="_".join((str(self.idn), arm)))
+ else:
+ # Load null gripper
+ self.gripper[arm] = gripper_factory(None, idn="_".join((str(self.idn), arm)))
+ # Grab eef rotation offset
+ self.eef_rot_offset[arm] = T.quat_multiply(
+ self.robot_model.hand_rotation_offset[arm], self.gripper[arm].rotation_offset
+ )
+ # Add this gripper to the robot model
+ self.robot_model.add_gripper(self.gripper[arm], self.robot_model.eef_name[arm])
+
+ def reset(self, deterministic=False):
+ """
+ Sets initial pose of arm and grippers. Overrides gripper joint configuration if we're using a
+ deterministic reset (e.g.: hard reset from xml file)
+
+ Args:
+ deterministic (bool): If true, will not randomize initializations within the sim
+ """
+ # First, run the superclass method to reset the position and controller
+ super().reset(deterministic)
+
+ # Setup arm-specific values
+ for arm in self.arms:
+ # Now, reset the grippers if necessary
+ if self.has_gripper[arm]:
+ if not deterministic:
+ self.sim.data.qpos[self._ref_gripper_joint_pos_indexes[arm]] = self.gripper[arm].init_qpos
+
+ self.gripper[arm].current_action = np.zeros(self.gripper[arm].dof)
+
+ # Update base pos / ori references in controller (technically only needs to be called once)
+ self.controller[arm].update_base_pose(self.base_pos, self.base_ori)
+ # Setup buffers for eef values
+ self.recent_ee_forcetorques[arm] = DeltaBuffer(dim=6)
+ self.recent_ee_pose[arm] = DeltaBuffer(dim=7)
+ self.recent_ee_vel[arm] = DeltaBuffer(dim=6)
+ self.recent_ee_vel_buffer[arm] = RingBuffer(dim=6, length=10)
+ self.recent_ee_acc[arm] = DeltaBuffer(dim=6)
+
+ def setup_references(self):
+ """
+ Sets up necessary reference for robots, grippers, and objects.
+
+ Note that this should get called during every reset from the environment
+ """
+ # First, run the superclass method to setup references for joint-related values / indexes
+ super().setup_references()
+
+ # Now, add references to gripper if necessary
+ # indices for grippers in qpos, qvel
+ for arm in self.arms:
+ if self.has_gripper[arm]:
+ self.gripper_joints[arm] = list(self.gripper[arm].joints)
+ self._ref_gripper_joint_pos_indexes[arm] = [
+ self.sim.model.get_joint_qpos_addr(x) for x in self.gripper_joints[arm]
+ ]
+ self._ref_gripper_joint_vel_indexes[arm] = [
+ self.sim.model.get_joint_qvel_addr(x) for x in self.gripper_joints[arm]
+ ]
+ self._ref_joint_gripper_actuator_indexes[arm] = [
+ self.sim.model.actuator_name2id(actuator) for actuator in self.gripper[arm].actuators
+ ]
+
+ # IDs of sites for eef visualization
+ self.eef_site_id[arm] = self.sim.model.site_name2id(self.gripper[arm].important_sites["grip_site"])
+ self.eef_cylinder_id[arm] = self.sim.model.site_name2id(self.gripper[arm].important_sites["grip_cylinder"])
+
+ def control(self, action, policy_step=False):
+ """
+ Actuate the robot with the
+ passed joint velocities and gripper control.
+
+ Args:
+ action (np.array): The control to apply to the robot. The first @self.robot_model.dof dimensions should
+ be the desired normalized joint velocities and if the robot has a gripper, the next @self.gripper.dof
+ dimensions should be actuation controls for the gripper.
+
+ :NOTE: Assumes inputted actions are of form:
+ [right_arm_control, right_gripper_control, left_arm_control, left_gripper_control]
+
+ policy_step (bool): Whether a new policy step (action) is being taken
+
+ Raises:
+ AssertionError: [Invalid action dimension]
+ """
+ # verify that the action has the expected dimension
+ assert len(action) == self.action_dim, "environment got invalid action dimension -- expected {}, got {}".format(
+ self.action_dim, len(action)
+ )
+
+ self.torques = np.array([])
+ # Now execute actions for each arm
+ for arm in self.arms:
+ # Make sure to split action space correctly
+ (start, end) = (None, self._action_split_idx) if arm == "right" else (self._action_split_idx, None)
+ sub_action = action[start:end]
+
+ gripper_action = None
+ if self.has_gripper[arm]:
+ # get all indexes past controller dimension indexes
+ gripper_action = sub_action[self.controller[arm].control_dim :]
+ sub_action = sub_action[: self.controller[arm].control_dim]
+
+ # Update the controller goal if this is a new policy step
+ if policy_step:
+ self.controller[arm].set_goal(sub_action)
+
+ # Now run the controller for a step and add it to the torques
+ self.torques = np.concatenate((self.torques, self.controller[arm].run_controller()))
+
+ # Get gripper action, if applicable
+ if self.has_gripper[arm]:
+ self.grip_action(gripper=self.gripper[arm], gripper_action=gripper_action)
+
+ # Clip the torques
+ low, high = self.torque_limits
+ self.torques = np.clip(self.torques, low, high)
+
+ # Apply joint torque control
+ self.sim.data.ctrl[self._ref_joint_actuator_indexes] = self.torques
+
+ # If this is a policy step, also update buffers holding recent values of interest
+ if policy_step:
+ # Update proprioceptive values
+ self.recent_qpos.push(self._joint_positions)
+ self.recent_actions.push(action)
+ self.recent_torques.push(self.torques)
+
+ for arm in self.arms:
+ # Update arm-specific proprioceptive values
+ self.recent_ee_forcetorques[arm].push(np.concatenate((self.ee_force[arm], self.ee_torque[arm])))
+ self.recent_ee_pose[arm].push(
+ np.concatenate((self.controller[arm].ee_pos, T.mat2quat(self.controller[arm].ee_ori_mat)))
+ )
+ self.recent_ee_vel[arm].push(
+ np.concatenate((self.controller[arm].ee_pos_vel, self.controller[arm].ee_ori_vel))
+ )
+
+ # Estimation of eef acceleration (averaged derivative of recent velocities)
+ self.recent_ee_vel_buffer[arm].push(
+ np.concatenate((self.controller[arm].ee_pos_vel, self.controller[arm].ee_ori_vel))
+ )
+ diffs = np.vstack(
+ [
+ self.recent_ee_acc[arm].current,
+ self.control_freq * np.diff(self.recent_ee_vel_buffer[arm].buf, axis=0),
+ ]
+ )
+ ee_acc = np.array([np.convolve(col, np.ones(10) / 10.0, mode="valid")[0] for col in diffs.transpose()])
+ self.recent_ee_acc[arm].push(ee_acc)
+
+ def _visualize_grippers(self, visible):
+ """
+ Visualizes the gripper site(s) if applicable.
+
+ Args:
+ visible (bool): True if visualizing the gripper for this arm.
+ """
+ for arm in self.arms:
+ self.gripper[arm].set_sites_visibility(sim=self.sim, visible=visible)
+
+ def setup_observables(self):
+ """
+ Sets up observables to be used for this robot
+
+ Returns:
+ OrderedDict: Dictionary mapping observable names to its corresponding Observable object
+ """
+ # Get general robot observables first
+ observables = super().setup_observables()
+
+ # Get prefix from robot model to avoid naming clashes for multiple robots and define observables modality
+ pf = self.robot_model.naming_prefix
+ modality = f"{pf}proprio"
+ sensors = []
+ names = []
+
+ for arm in self.arms:
+ # Add in eef info
+ arm_sensors, arm_sensor_names = self._create_arm_sensors(arm=arm, modality=modality)
+ sensors += arm_sensors
+ names += arm_sensor_names
+
+ # Create observables for this robot
+ for name, s in zip(names, sensors):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ )
+
+ return observables
+
+ def _create_arm_sensors(self, arm, modality):
+ """
+ Helper function to create sensors for a given arm. This is abstracted in a separate function call so that we
+ don't have local function naming collisions during the _setup_observables() call.
+
+ Args:
+ arm (str): Arm to create sensors for
+ modality (str): Modality to assign to all sensors
+
+ Returns:
+ 2-tuple:
+ sensors (list): Array of sensors for the given arm
+ names (list): array of corresponding observable names
+ """
+ pf = self.robot_model.naming_prefix
+
+ # eef features
+ @sensor(modality=modality)
+ def eef_pos(obs_cache):
+ return np.array(self.sim.data.site_xpos[self.eef_site_id[arm]])
+
+ @sensor(modality=modality)
+ def eef_quat(obs_cache):
+ return T.convert_quat(self.sim.data.get_body_xquat(self.robot_model.eef_name[arm]), to="xyzw")
+
+ sensors = [eef_pos, eef_quat]
+ names = [f"{pf}{arm}_eef_pos", f"{pf}{arm}_eef_quat"]
+
+ # add in gripper sensors if this robot has a gripper
+ if self.has_gripper[arm]:
+
+ @sensor(modality=modality)
+ def gripper_qpos(obs_cache):
+ return np.array([self.sim.data.qpos[x] for x in self._ref_gripper_joint_pos_indexes[arm]])
+
+ @sensor(modality=modality)
+ def gripper_qvel(obs_cache):
+ return np.array([self.sim.data.qvel[x] for x in self._ref_gripper_joint_vel_indexes[arm]])
+
+ sensors += [gripper_qpos, gripper_qvel]
+ names += [f"{pf}{arm}_gripper_qpos", f"{pf}{arm}_gripper_qvel"]
+
+ return sensors, names
+
+ def _input2dict(self, inp):
+ """
+ Helper function that converts an input that is either a single value or a list into a dict with keys for
+ each arm: "right", "left"
+
+ Args:
+ inp (str or list or None): Input value to be converted to dict
+
+ :Note: If inp is a list, then assumes format is [right, left]
+
+ Returns:
+ dict: Inputs mapped for each robot arm
+ """
+ # First, convert to list if necessary
+ if type(inp) is not list:
+ inp = [inp for _ in range(2)]
+ # Now, convert list to dict and return
+ return {key: value for key, value in zip(self.arms, inp)}
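+ # Illustrative behaviour (values are placeholders):
+ #   _input2dict("default")        -> {"right": "default", "left": "default"}
+ #   _input2dict([cfg_r, cfg_l])   -> {"right": cfg_r, "left": cfg_l}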
+
+ @property
+ def arms(self):
+ """
+ Returns name of arms used as naming convention throughout this module
+
+ Returns:
+ 2-tuple: ('right', 'left')
+ """
+ return "right", "left"
+
+ @property
+ def action_limits(self):
+ """
+ Action lower/upper limits per dimension.
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum (low) action values
+ - (np.array) maximum (high) action values
+ """
+ # Action limits based on controller limits
+ low, high = [], []
+ for arm in self.arms:
+ low_g, high_g = (
+ ([-1] * self.gripper[arm].dof, [1] * self.gripper[arm].dof) if self.has_gripper[arm] else ([], [])
+ )
+ low_c, high_c = self.controller[arm].control_limits
+ low, high = np.concatenate([low, low_c, low_g]), np.concatenate([high, high_c, high_g])
+ return low, high
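+ # Illustrative shape (assuming 6-DoF OSC_POSE controllers and 1-DoF grippers
+ # on both arms): low/high each have length 14, ordered as
+ # [right_arm(6), right_gripper(1), left_arm(6), left_gripper(1)].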
+
+ @property
+ def ee_ft_integral(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the integral over time of the applied ee force-torque for that arm
+ """
+ vals = {}
+ for arm in self.arms:
+ vals[arm] = np.abs((1.0 / self.control_freq) * self.recent_ee_forcetorques[arm].average)
+ return vals
+
+ @property
+ def ee_force(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the force applied at the force sensor at the robot arm's eef
+ """
+ vals = {}
+ for arm in self.arms:
+ vals[arm] = self.get_sensor_measurement(self.gripper[arm].important_sensors["force_ee"])
+ return vals
+
+ @property
+ def ee_torque(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the torque applied at the torque sensor at the robot arm's eef
+ """
+ vals = {}
+ for arm in self.arms:
+ vals[arm] = self.get_sensor_measurement(self.gripper[arm].important_sensors["torque_ee"])
+ return vals
+
+ @property
+ def _hand_pose(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the eef pose in base frame of robot.
+ """
+ vals = {}
+ for arm in self.arms:
+ vals[arm] = self.pose_in_base_from_name(self.robot_model.eef_name[arm])
+ return vals
+
+ @property
+ def _hand_quat(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the eef quaternion in base frame of robot.
+ """
+ vals = {}
+ orns = self._hand_orn
+ for arm in self.arms:
+ vals[arm] = T.mat2quat(orns[arm])
+ return vals
+
+ @property
+ def _hand_total_velocity(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the total eef velocity (linear + angular) in the base frame
+ as a numpy array of shape (6,)
+ """
+ vals = {}
+ for arm in self.arms:
+ # Determine correct start, end points based on arm
+ (start, end) = (None, self._joint_split_idx) if arm == "right" else (self._joint_split_idx, None)
+
+ # Use jacobian to translate joint velocities to end effector velocities.
+ Jp = self.sim.data.get_body_jacp(self.robot_model.eef_name[arm]).reshape((3, -1))
+ Jp_joint = Jp[:, self._ref_joint_vel_indexes[start:end]]
+
+ Jr = self.sim.data.get_body_jacr(self.robot_model.eef_name[arm]).reshape((3, -1))
+ Jr_joint = Jr[:, self._ref_joint_vel_indexes[start:end]]
+
+ eef_lin_vel = Jp_joint.dot(self._joint_velocities)
+ eef_rot_vel = Jr_joint.dot(self._joint_velocities)
+ vals[arm] = np.concatenate([eef_lin_vel, eef_rot_vel])
+ return vals
+
+ @property
+ def _hand_pos(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the position of eef in base frame of robot.
+ """
+ vals = {}
+ poses = self._hand_pose
+ for arm in self.arms:
+ eef_pose_in_base = poses[arm]
+ vals[arm] = eef_pose_in_base[:3, 3]
+ return vals
+
+ @property
+ def _hand_orn(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the orientation of eef in base frame of robot as a rotation matrix.
+ """
+ vals = {}
+ poses = self._hand_pose
+ for arm in self.arms:
+ eef_pose_in_base = poses[arm]
+ vals[arm] = eef_pose_in_base[:3, :3]
+ return vals
+
+ @property
+ def _hand_vel(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the velocity of eef in base frame of robot.
+ """
+ vels = self._hand_total_velocity
+ for arm in self.arms:
+ vels[arm] = vels[arm][:3]
+ return vels
+
+ @property
+ def _hand_ang_vel(self):
+ """
+ Returns:
+ dict: each arm-specific entry specifies the angular velocity of eef in base frame of robot.
+ """
+ vels = self._hand_total_velocity
+ for arm in self.arms:
+ vels[arm] = vels[arm][3:]
+ return vels
+
+ @property
+ def _action_split_idx(self):
+ """
+ Grabs the index that correctly splits the right arm from the left arm actions
+
+ :NOTE: Assumes inputted actions are of form:
+ [right_arm_control, right_gripper_control, left_arm_control, left_gripper_control]
+
+ Returns:
+ int: Index splitting right from left arm actions
+ """
+ return (
+ self.controller["right"].control_dim + self.gripper["right"].dof
+ if self.has_gripper["right"]
+ else self.controller["right"].control_dim
+ )
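+ # Worked example (numbers are assumptions): with a 6-DoF OSC_POSE controller
+ # and a 1-DoF gripper on the right arm, the split index is 7, so action[:7]
+ # drives the right arm + gripper and action[7:] drives the left.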
+
+ @property
+ def _joint_split_idx(self):
+ """
+ Returns:
+ int: the index that correctly splits the right arm from the left arm joints
+ """
+ return int(len(self.robot_joints) / 2)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/robots/manipulator.py b/phantom/submodules/phantom-robosuite/robosuite/robots/manipulator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b34e6c56b607dc15fe07823f4ac29345a860d986
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/robots/manipulator.py
@@ -0,0 +1,164 @@
+from robosuite.robots.robot import Robot
+
+
+class Manipulator(Robot):
+ """
+ Initializes a manipulator robot simulation object, as defined by a single corresponding robot arm XML and
+ associated gripper XML
+ """
+
+ def _load_controller(self):
+ raise NotImplementedError
+
+ def control(self, action, policy_step=False):
+ raise NotImplementedError
+
+ def grip_action(self, gripper, gripper_action):
+ """
+ Executes @gripper_action for specified @gripper
+
+ Args:
+ gripper (GripperModel): Gripper to execute action for
+ gripper_action (np.array): Normalized gripper action in [-1, 1] to send to the gripper
+ """
+ actuator_idxs = [self.sim.model.actuator_name2id(actuator) for actuator in gripper.actuators]
+ if self.direct_gripper_control:
+ if "Robotiq85" in gripper.name:
+ applied_gripper_action = gripper_action[0]
+ else:
+ applied_gripper_action = [gripper_action[0], -gripper_action[0]]
+ else:
+ gripper_action_actual = gripper.format_action(gripper_action)
+ # rescale normalized gripper action to control ranges
+ ctrl_range = self.sim.model.actuator_ctrlrange[actuator_idxs]
+ bias = 0.5 * (ctrl_range[:, 1] + ctrl_range[:, 0])
+ weight = 0.5 * (ctrl_range[:, 1] - ctrl_range[:, 0])
+ applied_gripper_action = bias + weight * gripper_action_actual
+ self.sim.data.ctrl[actuator_idxs] = applied_gripper_action
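+ # Worked rescaling example (numbers are illustrative): for a ctrl range of
+ # [0.0, 0.04], bias = 0.02 and weight = 0.02, so a normalized command of +1
+ # maps to 0.04 and -1 maps to 0.0.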
+
+ def visualize(self, vis_settings):
+ """
+ Do any necessary visualization for this manipulator
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "robots" and "grippers" keyword as well as any other
+ robot-specific options specified.
+ """
+ super().visualize(vis_settings=vis_settings)
+ self._visualize_grippers(visible=vis_settings["grippers"])
+
+ def _visualize_grippers(self, visible):
+ """
+ Visualizes the gripper site(s) if applicable.
+
+ Args:
+ visible (bool): True if visualizing grippers, else False
+ """
+ raise NotImplementedError
+
+ @property
+ def action_limits(self):
+ raise NotImplementedError
+
+ @property
+ def dof(self):
+ """
+ Returns:
+ int: degrees of freedom of the robot (with grippers).
+ """
+ # Get the dof of the base robot model
+ dof = super().dof
+ for gripper in self.robot_model.grippers.values():
+ dof += gripper.dof
+ return dof
+
+ @property
+ def ee_ft_integral(self):
+ """
+ Returns:
+ float or dict: either single value or arm-specific entries specifying the integral over time of the applied
+ ee force-torque for that arm
+ """
+ raise NotImplementedError
+
+ @property
+ def ee_force(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the force applied at the force sensor
+ at the robot arm's eef
+ """
+ raise NotImplementedError
+
+ @property
+ def ee_torque(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the torque applied at the torque
+ sensor at the robot arm's eef
+ """
+ raise NotImplementedError
+
+ @property
+ def _hand_pose(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the eef pose in base frame of
+ robot.
+ """
+ raise NotImplementedError
+
+ @property
+ def _hand_quat(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the eef quaternion in base frame
+ of robot.
+ """
+ raise NotImplementedError
+
+ @property
+ def _hand_total_velocity(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the total eef velocity
+ (linear + angular) in the base frame as a numpy array of shape (6,)
+ """
+ raise NotImplementedError
+
+ @property
+ def _hand_pos(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the position of eef in base frame
+ of robot.
+ """
+ raise NotImplementedError
+
+ @property
+ def _hand_orn(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the orientation of eef in base
+ frame of robot as a rotation matrix.
+ """
+ raise NotImplementedError
+
+ @property
+ def _hand_vel(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the velocity of eef in base frame
+ of robot.
+ """
+ raise NotImplementedError
+
+ @property
+ def _hand_ang_vel(self):
+ """
+ Returns:
+ np.array or dict: either single value or arm-specific entries specifying the angular velocity of eef in
+ base frame of robot.
+ """
+ raise NotImplementedError
diff --git a/phantom/submodules/phantom-robosuite/robosuite/robots/robot.py b/phantom/submodules/phantom-robosuite/robosuite/robots/robot.py
new file mode 100644
index 0000000000000000000000000000000000000000..e31586aef77f26d92ff26f01130899ee78a7c7e7
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/robots/robot.py
@@ -0,0 +1,387 @@
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.macros as macros
+import robosuite.utils.transform_utils as T
+from robosuite.models.mounts import mount_factory
+from robosuite.models.robots import create_robot
+from robosuite.utils.binding_utils import MjSim
+from robosuite.utils.buffers import DeltaBuffer
+from robosuite.utils.observables import Observable, sensor
+
+
+class Robot(object):
+ """
+ Initializes a robot simulation object, as defined by a single corresponding robot XML
+
+ Args:
+ robot_type (str): Specification for specific robot arm to be instantiated within this env (e.g: "Panda")
+
+ idn (int or str): Unique ID of this robot. Should be different from others
+
+ initial_qpos (sequence of float): If set, determines the initial joint positions of the robot to be
+ instantiated for the task
+
+ initialization_noise (dict): Dict containing the initialization noise parameters. The expected keys and
+ corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to "None" or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ :Note: Specifying None will automatically create the required dict with "magnitude" set to 0.0
+
+ mount_type (str): type of mount, used to instantiate mount models from mount factory.
+ Default is "default", which is the default mount associated with this robot's corresponding model.
+ None results in no mount, and any other (valid) model overrides the default mount.
+
+        control_freq (float): how many control signals to receive
+            per second. This sets the amount of simulation time
+            that passes between every action input.
+ """
+
+ def __init__(
+ self,
+ robot_type: str,
+ idn=0,
+ initial_qpos=None,
+ initialization_noise=None,
+ mount_type="default",
+ control_freq=20,
+ ):
+ # Set relevant attributes
+ self.sim = None # MjSim this robot is tied to
+ self.name = robot_type # Specific robot to instantiate
+ self.idn = idn # Unique ID of this robot
+ self.robot_model = None # object holding robot model-specific info
+ self.control_freq = control_freq # controller Hz
+ self.mount_type = mount_type # Type of mount to use
+
+ # Scaling of Gaussian initial noise applied to robot joints
+ self.initialization_noise = initialization_noise
+ if self.initialization_noise is None:
+ self.initialization_noise = {"magnitude": 0.0, "type": "gaussian"} # no noise conditions
+ elif self.initialization_noise == "default":
+ self.initialization_noise = {"magnitude": 0.02, "type": "gaussian"}
+ self.initialization_noise["magnitude"] = (
+ self.initialization_noise["magnitude"] if self.initialization_noise["magnitude"] else 0.0
+ )
+
+ self.init_qpos = initial_qpos # n-dim list / array of robot joints
+
+ self.robot_joints = None # xml joint names for robot
+ self.base_pos = None # Base position in world coordinates (x,y,z)
+ self.base_ori = None # Base rotation in world coordinates (x,y,z,w quat)
+ self._ref_joint_indexes = None # xml joint indexes for robot in mjsim
+ self._ref_joint_pos_indexes = None # xml joint position indexes in mjsim
+ self._ref_joint_vel_indexes = None # xml joint velocity indexes in mjsim
+ self._ref_joint_actuator_indexes = None # xml joint (torq) actuator indexes for robot in mjsim
+
+ self.recent_qpos = None # Current and last robot arm qpos
+ self.recent_actions = None # Current and last action applied
+ self.recent_torques = None # Current and last torques applied
+
+ def _load_controller(self):
+ """
+ Loads controller to be used for dynamic trajectories.
+ """
+ raise NotImplementedError
+
+ def load_model(self):
+ """
+ Loads robot and optionally add grippers.
+ """
+ self.robot_model = create_robot(self.name, idn=self.idn)
+
+ # Add mount if specified
+ if self.mount_type == "default":
+ self.robot_model.add_mount(mount=mount_factory(self.robot_model.default_mount, idn=self.idn))
+ else:
+ self.robot_model.add_mount(mount=mount_factory(self.mount_type, idn=self.idn))
+
+ # Use default from robot model for initial joint positions if not specified
+ if self.init_qpos is None:
+ self.init_qpos = self.robot_model.init_qpos
+
+ def reset_sim(self, sim: MjSim):
+ """
+ Replaces current sim with a new sim
+
+ Args:
+ sim (MjSim): New simulation being instantiated to replace the old one
+ """
+ self.sim = sim
+
+ def reset(self, deterministic=False):
+ """
+ Sets initial pose of arm and grippers. Overrides robot joint configuration if we're using a
+ deterministic reset (e.g.: hard reset from xml file)
+
+ Args:
+ deterministic (bool): If true, will not randomize initializations within the sim
+
+ Raises:
+ ValueError: [Invalid noise type]
+ """
+ init_qpos = np.array(self.init_qpos)
+ if not deterministic:
+ # Determine noise
+ if self.initialization_noise["type"] == "gaussian":
+ noise = np.random.randn(len(self.init_qpos)) * self.initialization_noise["magnitude"]
+ elif self.initialization_noise["type"] == "uniform":
+ noise = np.random.uniform(-1.0, 1.0, len(self.init_qpos)) * self.initialization_noise["magnitude"]
+ else:
+ raise ValueError("Error: Invalid noise type specified. Options are 'gaussian' or 'uniform'.")
+ init_qpos += noise
+
+ # Set initial position in sim
+ self.sim.data.qpos[self._ref_joint_pos_indexes] = init_qpos
+
+ # Load controllers
+ self._load_controller()
+
+ # Update base pos / ori references
+ self.base_pos = self.sim.data.get_body_xpos(self.robot_model.root_body)
+ self.base_ori = T.mat2quat(self.sim.data.get_body_xmat(self.robot_model.root_body).reshape((3, 3)))
+
+ # Setup buffers to hold recent values
+ self.recent_qpos = DeltaBuffer(dim=len(self.joint_indexes))
+ self.recent_actions = DeltaBuffer(dim=self.action_dim)
+ self.recent_torques = DeltaBuffer(dim=len(self.joint_indexes))
+
+ def setup_references(self):
+ """
+ Sets up necessary reference for robots, grippers, and objects.
+ """
+ # indices for joints in qpos, qvel
+ self.robot_joints = self.robot_model.joints
+ self._ref_joint_pos_indexes = [self.sim.model.get_joint_qpos_addr(x) for x in self.robot_joints]
+ self._ref_joint_vel_indexes = [self.sim.model.get_joint_qvel_addr(x) for x in self.robot_joints]
+
+ # indices for joint indexes
+ self._ref_joint_indexes = [self.sim.model.joint_name2id(joint) for joint in self.robot_model.joints]
+
+ # indices for joint pos actuation, joint vel actuation, gripper actuation
+ self._ref_joint_actuator_indexes = [
+ self.sim.model.actuator_name2id(actuator) for actuator in self.robot_model.actuators
+ ]
+
+ def setup_observables(self):
+ """
+ Sets up observables to be used for this robot
+
+ Returns:
+ OrderedDict: Dictionary mapping observable names to its corresponding Observable object
+ """
+ # Get prefix from robot model to avoid naming clashes for multiple robots and define observables modality
+ pf = self.robot_model.naming_prefix
+ pre_compute = f"{pf}joint_pos"
+ modality = f"{pf}proprio"
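+        # pre_compute names the cached joint_pos entry that the cos/sin sensors below reuse
+        # within a single observation update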
+
+ # proprioceptive features
+ @sensor(modality=modality)
+ def joint_pos(obs_cache):
+ return np.array([self.sim.data.qpos[x] for x in self._ref_joint_pos_indexes])
+
+ @sensor(modality=modality)
+ def joint_pos_cos(obs_cache):
+ return np.cos(obs_cache[pre_compute]) if pre_compute in obs_cache else np.zeros(self.robot_model.dof)
+
+ @sensor(modality=modality)
+ def joint_pos_sin(obs_cache):
+ return np.sin(obs_cache[pre_compute]) if pre_compute in obs_cache else np.zeros(self.robot_model.dof)
+
+ @sensor(modality=modality)
+ def joint_vel(obs_cache):
+ return np.array([self.sim.data.qvel[x] for x in self._ref_joint_vel_indexes])
+
+ sensors = [joint_pos, joint_pos_cos, joint_pos_sin, joint_vel]
+ names = ["joint_pos", "joint_pos_cos", "joint_pos_sin", "joint_vel"]
+ # We don't want to include the direct joint pos sensor outputs
+ actives = [False, True, True, True]
+
+ # Create observables for this robot
+ observables = OrderedDict()
+ for name, s, active in zip(names, sensors, actives):
+ obs_name = pf + name
+ observables[obs_name] = Observable(
+ name=obs_name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ active=active,
+ )
+
+ return observables
+
+ def control(self, action, policy_step=False):
+ """
+ Actuate the robot with the
+ passed joint velocities and gripper control.
+
+ Args:
+ action (np.array): The control to apply to the robot. The first @self.robot_model.dof dimensions should
+ be the desired normalized joint velocities and if the robot has a gripper, the next @self.gripper.dof
+ dimensions should be actuation controls for the gripper.
+ policy_step (bool): Whether a new policy step (action) is being taken
+ """
+ raise NotImplementedError
+
+ def check_q_limits(self):
+ """
+ Check if this robot is either very close or at the joint limits
+
+ Returns:
+ bool: True if this arm is near its joint limits
+ """
+ tolerance = 0.1
+ for (qidx, (q, q_limits)) in enumerate(
+ zip(self.sim.data.qpos[self._ref_joint_pos_indexes], self.sim.model.jnt_range[self._ref_joint_indexes])
+ ):
+ if q_limits[0] != q_limits[1] and not (q_limits[0] + tolerance < q < q_limits[1] - tolerance):
+ print("Joint limit reached in joint " + str(qidx))
+ return True
+ return False
+
+ def visualize(self, vis_settings):
+ """
+ Do any necessary visualization for this robot
+
+ Args:
+ vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
+ component should be visualized. Should have "robots" keyword as well as any other robot-specific
+ options specified.
+ """
+ self.robot_model.set_sites_visibility(sim=self.sim, visible=vis_settings["robots"])
+
+ @property
+ def action_limits(self):
+ """
+ Action lower/upper limits per dimension.
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum (low) action values
+ - (np.array) maximum (high) action values
+ """
+ raise NotImplementedError
+
+ @property
+ def torque_limits(self):
+ """
+ Torque lower/upper limits per dimension.
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum (low) torque values
+ - (np.array) maximum (high) torque values
+ """
+ # Torque limit values pulled from relevant robot.xml file
+ low = self.sim.model.actuator_ctrlrange[self._ref_joint_actuator_indexes, 0]
+ high = self.sim.model.actuator_ctrlrange[self._ref_joint_actuator_indexes, 1]
+
+ return low, high
+
+ @property
+ def action_dim(self):
+ """
+ Action space dimension for this robot
+ """
+ return self.action_limits[0].shape[0]
+
+ @property
+ def dof(self):
+ """
+ Returns:
+ int: the active DoF of the robot (Number of robot joints + active gripper DoF).
+ """
+ dof = self.robot_model.dof
+ return dof
+
+ def pose_in_base_from_name(self, name):
+ """
+ A helper function that takes in a named data field and returns the pose
+ of that object in the base frame.
+
+ Args:
+ name (str): Name of body in sim to grab pose
+
+ Returns:
+ np.array: (4,4) array corresponding to the pose of @name in the base frame
+ """
+
+ pos_in_world = self.sim.data.get_body_xpos(name)
+ rot_in_world = self.sim.data.get_body_xmat(name).reshape((3, 3))
+ pose_in_world = T.make_pose(pos_in_world, rot_in_world)
+
+ base_pos_in_world = self.sim.data.get_body_xpos(self.robot_model.root_body)
+ base_rot_in_world = self.sim.data.get_body_xmat(self.robot_model.root_body).reshape((3, 3))
+ base_pose_in_world = T.make_pose(base_pos_in_world, base_rot_in_world)
+ world_pose_in_base = T.pose_inv(base_pose_in_world)
+
+ pose_in_base = T.pose_in_A_to_pose_in_B(pose_in_world, world_pose_in_base)
+ return pose_in_base
+
+ def set_robot_joint_positions(self, jpos):
+ """
+ Helper method to force robot joint positions to the passed values.
+
+ Args:
+ jpos (np.array): Joint positions to manually set the robot to
+ """
+ self.sim.data.qpos[self._ref_joint_pos_indexes] = jpos
+ self.sim.forward()
+
+ @property
+ def js_energy(self):
+ """
+ Returns:
+ np.array: the energy consumed by each joint between previous and current steps
+ """
+ # We assume in the motors torque is proportional to current (and voltage is constant)
+ # In that case the amount of power scales proportional to the torque and the energy is the
+ # time integral of that
+ # Note that we use mean torque
+ return np.abs((1.0 / self.control_freq) * self.recent_torques.average)
+
+ @property
+ def _joint_positions(self):
+ """
+ Returns:
+ np.array: joint positions (in angles / radians)
+ """
+ return self.sim.data.qpos[self._ref_joint_pos_indexes]
+
+ @property
+ def _joint_velocities(self):
+ """
+ Returns:
+ np.array: joint velocities (angular velocity)
+ """
+ return self.sim.data.qvel[self._ref_joint_vel_indexes]
+
+ @property
+ def joint_indexes(self):
+ """
+ Returns:
+ list: mujoco internal indexes for the robot joints
+ """
+ return self._ref_joint_indexes
+
+ def get_sensor_measurement(self, sensor_name):
+ """
+ Grabs relevant sensor data from the sim object
+
+ Args:
+ sensor_name (str): name of the sensor
+
+ Returns:
+ np.array: sensor values
+ """
+ sensor_idx = np.sum(self.sim.model.sensor_dim[: self.sim.model.sensor_name2id(sensor_name)])
+ sensor_dim = self.sim.model.sensor_dim[self.sim.model.sensor_name2id(sensor_name)]
+ return np.array(self.sim.data.sensordata[sensor_idx : sensor_idx + sensor_dim])
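For context, a minimal sketch of the frame composition that pose_in_base_from_name() performs, using the same transform_utils helpers; the numeric poses here are illustrative only.

```python
import numpy as np
import robosuite.utils.transform_utils as T

# Express a body pose (known in the world frame) in the robot's base frame.
body_in_world = T.make_pose(np.array([0.5, 0.0, 1.0]), np.eye(3))   # illustrative body pose
base_in_world = T.make_pose(np.array([0.0, 0.0, 0.8]), np.eye(3))   # illustrative base pose

world_in_base = T.pose_inv(base_in_world)                            # invert the base pose
body_in_base = T.pose_in_A_to_pose_in_B(body_in_world, world_in_base)
print(body_in_base[:3, 3])   # position of the body relative to the base
```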
diff --git a/phantom/submodules/phantom-robosuite/robosuite/robots/single_arm.py b/phantom/submodules/phantom-robosuite/robosuite/robots/single_arm.py
new file mode 100644
index 0000000000000000000000000000000000000000..934f91728a801faf113f4a5f1eacdfd4868c66dd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/robots/single_arm.py
@@ -0,0 +1,463 @@
+import copy
+import os
+from collections import OrderedDict
+
+import numpy as np
+
+import robosuite.utils.transform_utils as T
+from robosuite.controllers import controller_factory, load_controller_config
+from robosuite.models.grippers import gripper_factory
+from robosuite.robots.manipulator import Manipulator
+from robosuite.utils.buffers import DeltaBuffer, RingBuffer
+from robosuite.utils.observables import Observable, sensor
+
+
+class SingleArm(Manipulator):
+ """
+ Initializes a single-armed robot simulation object.
+
+ Args:
+ robot_type (str): Specification for specific robot arm to be instantiated within this env (e.g: "Panda")
+
+ idn (int or str): Unique ID of this robot. Should be different from others
+
+ controller_config (dict): If set, contains relevant controller parameters for creating a custom controller.
+ Else, uses the default controller for this specific task
+
+ initial_qpos (sequence of float): If set, determines the initial joint positions of the robot to be
+ instantiated for the task
+
+ initialization_noise (dict): Dict containing the initialization noise parameters. The expected keys and
+ corresponding value types are specified below:
+
+ :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
+ joint positions. Setting this value to "None" or 0.0 results in no noise being applied.
+ If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
+ If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
+ :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"
+
+ :Note: Specifying None will automatically create the required dict with "magnitude" set to 0.0
+
+ mount_type (str): type of mount, used to instantiate mount models from mount factory.
+ Default is "default", which is the default mount associated with this robot's corresponding model.
+ None results in no mount, and any other (valid) model overrides the default mount.
+
+ gripper_type (str): type of gripper, used to instantiate
+ gripper models from gripper factory. Default is "default", which is the default gripper associated
+ within the 'robot' specification. None removes the gripper, and any other (valid) model overrides the
+ default gripper
+
+        control_freq (float): how many control signals to receive
+            per second. This sets the amount of simulation time
+            that passes between every action input.
+ """
+
+ def __init__(
+ self,
+ robot_type: str,
+ idn=0,
+ controller_config=None,
+ initial_qpos=None,
+ initialization_noise=None,
+ mount_type="default",
+ gripper_type="default",
+ control_freq=20,
+ direct_gripper_control=False,
+ ):
+
+ self.controller = None
+ self.controller_config = copy.deepcopy(controller_config)
+ self.gripper_type = gripper_type
+ self.has_gripper = self.gripper_type is not None
+
+ self.gripper = None # Gripper class
+ self.gripper_joints = None # xml joint names for gripper
+ self._ref_gripper_joint_pos_indexes = None # xml gripper joint position indexes in mjsim
+ self._ref_gripper_joint_vel_indexes = None # xml gripper joint velocity indexes in mjsim
+ self._ref_joint_gripper_actuator_indexes = None # xml gripper (pos) actuator indexes for robot in mjsim
+ self.eef_rot_offset = None # rotation offsets from final arm link to gripper (quat)
+ self.eef_site_id = None # xml element id for eef in mjsim
+ self.eef_cylinder_id = None # xml element id for eef cylinder in mjsim
+ self.torques = None # Current torques being applied
+
+ self.recent_ee_forcetorques = None # Current and last forces / torques sensed at eef
+ self.recent_ee_pose = None # Current and last eef pose (pos + ori (quat))
+ self.recent_ee_vel = None # Current and last eef velocity
+ self.recent_ee_vel_buffer = None # RingBuffer holding prior 10 values of velocity values
+ self.recent_ee_acc = None # Current and last eef acceleration
+
+ self.direct_gripper_control = direct_gripper_control
+
+ super().__init__(
+ robot_type=robot_type,
+ idn=idn,
+ initial_qpos=initial_qpos,
+ initialization_noise=initialization_noise,
+ mount_type=mount_type,
+ control_freq=control_freq,
+ )
+
+ def _load_controller(self):
+ """
+ Loads controller to be used for dynamic trajectories
+ """
+ # First, load the default controller if none is specified
+ if not self.controller_config:
+ # Need to update default for a single agent
+ controller_path = os.path.join(
+ os.path.dirname(__file__),
+ "..",
+ "controllers/config/{}.json".format(self.robot_model.default_controller_config),
+ )
+ self.controller_config = load_controller_config(custom_fpath=controller_path)
+
+        # Assert that the controller config is a dict:
+ # NOTE: "type" must be one of: {JOINT_POSITION, JOINT_TORQUE, JOINT_VELOCITY,
+ # OSC_POSITION, OSC_POSE, IK_POSE}
+ assert (
+ type(self.controller_config) == dict
+ ), "Inputted controller config must be a dict! Instead, got type: {}".format(type(self.controller_config))
+
+ # Add to the controller dict additional relevant params:
+ # the robot name, mujoco sim, eef_name, joint_indexes, timestep (model) freq,
+ # policy (control) freq, and ndim (# joints)
+ self.controller_config["robot_name"] = self.name
+ self.controller_config["sim"] = self.sim
+ self.controller_config["eef_name"] = self.gripper.important_sites["grip_site"]
+ self.controller_config["eef_rot_offset"] = self.eef_rot_offset
+ self.controller_config["joint_indexes"] = {
+ "joints": self.joint_indexes,
+ "qpos": self._ref_joint_pos_indexes,
+ "qvel": self._ref_joint_vel_indexes,
+ }
+ self.controller_config["actuator_range"] = self.torque_limits
+ self.controller_config["policy_freq"] = self.control_freq
+ self.controller_config["ndim"] = len(self.robot_joints)
+
+ # Instantiate the relevant controller
+ self.controller = controller_factory(self.controller_config["type"], self.controller_config)
+
+ def load_model(self):
+ """
+ Loads robot and optionally add grippers.
+ """
+ # First, run the superclass method to load the relevant model
+ super().load_model()
+
+ # Verify that the loaded model is of the correct type for this robot
+ if self.robot_model.arm_type != "single":
+ raise TypeError(
+ "Error loading robot model: Incompatible arm type specified for this robot. "
+ "Requested model arm type: {}, robot arm type: {}".format(self.robot_model.arm_type, type(self))
+ )
+
+ # Now, load the gripper if necessary
+ if self.has_gripper:
+ if self.gripper_type == "default":
+ # Load the default gripper from the robot file
+ self.gripper = gripper_factory(self.robot_model.default_gripper, idn=self.idn)
+ else:
+ # Load user-specified gripper
+ self.gripper = gripper_factory(self.gripper_type, idn=self.idn)
+ else:
+ # Load null gripper
+ self.gripper = gripper_factory(None, idn=self.idn)
+ # Grab eef rotation offset
+ self.eef_rot_offset = T.quat_multiply(self.robot_model.hand_rotation_offset, self.gripper.rotation_offset)
+ # Add gripper to this robot model
+ self.robot_model.add_gripper(self.gripper)
+
+ def reset(self, deterministic=False):
+ """
+ Sets initial pose of arm and grippers. Overrides gripper joint configuration if we're using a
+ deterministic reset (e.g.: hard reset from xml file)
+
+ Args:
+ deterministic (bool): If true, will not randomize initializations within the sim
+ """
+ # First, run the superclass method to reset the position and controller
+ super().reset(deterministic)
+
+ # Now, reset the gripper if necessary
+ if self.has_gripper:
+ if not deterministic:
+ self.sim.data.qpos[self._ref_gripper_joint_pos_indexes] = self.gripper.init_qpos
+
+ self.gripper.current_action = np.zeros(self.gripper.dof)
+
+ # Update base pos / ori references in controller
+ self.controller.update_base_pose(self.base_pos, self.base_ori)
+
+        # Setup buffers to hold recent values
+ self.recent_ee_forcetorques = DeltaBuffer(dim=6)
+ self.recent_ee_pose = DeltaBuffer(dim=7)
+ self.recent_ee_vel = DeltaBuffer(dim=6)
+ self.recent_ee_vel_buffer = RingBuffer(dim=6, length=10)
+ self.recent_ee_acc = DeltaBuffer(dim=6)
+
+ def setup_references(self):
+ """
+ Sets up necessary reference for robots, grippers, and objects.
+
+ Note that this should get called during every reset from the environment
+ """
+ # First, run the superclass method to setup references for joint-related values / indexes
+ super().setup_references()
+
+ # Now, add references to gripper if necessary
+ # indices for grippers in qpos, qvel
+ if self.has_gripper:
+ self.gripper_joints = list(self.gripper.joints)
+ self._ref_gripper_joint_pos_indexes = [self.sim.model.get_joint_qpos_addr(x) for x in self.gripper_joints]
+ self._ref_gripper_joint_vel_indexes = [self.sim.model.get_joint_qvel_addr(x) for x in self.gripper_joints]
+ self._ref_joint_gripper_actuator_indexes = [
+ self.sim.model.actuator_name2id(actuator) for actuator in self.gripper.actuators
+ ]
+
+ # IDs of sites for eef visualization
+ self.eef_site_id = self.sim.model.site_name2id(self.gripper.important_sites["grip_site"])
+ self.eef_cylinder_id = self.sim.model.site_name2id(self.gripper.important_sites["grip_cylinder"])
+
+ def control(self, action, policy_step=False):
+ """
+ Actuate the robot with the
+ passed joint velocities and gripper control.
+
+ Args:
+ action (np.array): The control to apply to the robot. The first @self.robot_model.dof dimensions should be
+ the desired normalized joint velocities and if the robot has a gripper, the next @self.gripper.dof
+ dimensions should be actuation controls for the gripper.
+ policy_step (bool): Whether a new policy step (action) is being taken
+
+ Raises:
+ AssertionError: [Invalid action dimension]
+ """
+
+ # clip actions into valid range
+ assert len(action) == self.action_dim, "environment got invalid action dimension -- expected {}, got {}".format(
+ self.action_dim, len(action)
+ )
+
+ gripper_action = None
+ if self.has_gripper:
+ gripper_action = action[self.controller.control_dim :] # all indexes past controller dimension indexes
+ arm_action = action[: self.controller.control_dim]
+ else:
+ arm_action = action
+
+ # Update the controller goal if this is a new policy step
+ if policy_step:
+ self.controller.set_goal(arm_action)
+
+ # Now run the controller for a step
+ torques = self.controller.run_controller()
+
+ # Clip the torques
+ low, high = self.torque_limits
+ self.torques = np.clip(torques, low, high)
+
+ # Get gripper action, if applicable
+ if self.has_gripper:
+ self.grip_action(gripper=self.gripper, gripper_action=gripper_action)
+
+ # Apply joint torque control
+ self.sim.data.ctrl[self._ref_joint_actuator_indexes] = self.torques
+
+ # If this is a policy step, also update buffers holding recent values of interest
+ if policy_step:
+ # Update proprioceptive values
+ self.recent_qpos.push(self._joint_positions)
+ self.recent_actions.push(action)
+ self.recent_torques.push(self.torques)
+ self.recent_ee_forcetorques.push(np.concatenate((self.ee_force, self.ee_torque)))
+ self.recent_ee_pose.push(np.concatenate((self.controller.ee_pos, T.mat2quat(self.controller.ee_ori_mat))))
+ self.recent_ee_vel.push(np.concatenate((self.controller.ee_pos_vel, self.controller.ee_ori_vel)))
+
+ # Estimation of eef acceleration (averaged derivative of recent velocities)
+ self.recent_ee_vel_buffer.push(np.concatenate((self.controller.ee_pos_vel, self.controller.ee_ori_vel)))
+ diffs = np.vstack(
+ [self.recent_ee_acc.current, self.control_freq * np.diff(self.recent_ee_vel_buffer.buf, axis=0)]
+ )
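+            # Average the previous estimate with the finite differences of the velocity buffer
+            # (a 10-sample moving average per axis)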
+ ee_acc = np.array([np.convolve(col, np.ones(10) / 10.0, mode="valid")[0] for col in diffs.transpose()])
+ self.recent_ee_acc.push(ee_acc)
+
+ def _visualize_grippers(self, visible):
+ """
+ Visualizes the gripper site(s) if applicable.
+
+ Args:
+ visible (bool): True if visualizing the gripper for this arm.
+ """
+ self.gripper.set_sites_visibility(sim=self.sim, visible=visible)
+
+ def setup_observables(self):
+ """
+ Sets up observables to be used for this robot
+
+ Returns:
+ OrderedDict: Dictionary mapping observable names to its corresponding Observable object
+ """
+ # Get general robot observables first
+ observables = super().setup_observables()
+
+ # Get prefix from robot model to avoid naming clashes for multiple robots and define observables modality
+ pf = self.robot_model.naming_prefix
+ modality = f"{pf}proprio"
+
+ # eef features
+ @sensor(modality=modality)
+ def eef_pos(obs_cache):
+ return np.array(self.sim.data.site_xpos[self.eef_site_id])
+
+ @sensor(modality=modality)
+ def eef_quat(obs_cache):
+ return T.convert_quat(self.sim.data.get_body_xquat(self.robot_model.eef_name), to="xyzw")
+
+ @sensor(modality=modality)
+ def eef_vel_lin(obs_cache):
+ return np.array(self.sim.data.get_body_xvelp(self.robot_model.eef_name))
+
+ @sensor(modality=modality)
+ def eef_vel_ang(obs_cache):
+ return np.array(self.sim.data.get_body_xvelr(self.robot_model.eef_name))
+
+ sensors = [eef_pos, eef_quat, eef_vel_lin, eef_vel_ang]
+ names = [f"{pf}eef_pos", f"{pf}eef_quat", f"{pf}eef_vel_lin", f"{pf}eef_vel_ang"]
+ # Exclude eef vel by default
+ actives = [True, True, False, False]
+
+ # add in gripper sensors if this robot has a gripper
+ if self.has_gripper:
+
+ @sensor(modality=modality)
+ def gripper_qpos(obs_cache):
+ return np.array([self.sim.data.qpos[x] for x in self._ref_gripper_joint_pos_indexes])
+
+ @sensor(modality=modality)
+ def gripper_qvel(obs_cache):
+ return np.array([self.sim.data.qvel[x] for x in self._ref_gripper_joint_vel_indexes])
+
+ sensors += [gripper_qpos, gripper_qvel]
+ names += [f"{pf}gripper_qpos", f"{pf}gripper_qvel"]
+ actives += [True, True]
+
+ # Create observables for this robot
+ for name, s, active in zip(names, sensors, actives):
+ observables[name] = Observable(
+ name=name,
+ sensor=s,
+ sampling_rate=self.control_freq,
+ active=active,
+ )
+
+ return observables
+
+ @property
+ def action_limits(self):
+ """
+ Action lower/upper limits per dimension.
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum (low) action values
+ - (np.array) maximum (high) action values
+ """
+ # Action limits based on controller limits
+ low, high = ([-1] * self.gripper.dof, [1] * self.gripper.dof) if self.has_gripper else ([], [])
+ low_c, high_c = self.controller.control_limits
+ low = np.concatenate([low_c, low])
+ high = np.concatenate([high_c, high])
+
+ return low, high
+
+ @property
+ def ee_ft_integral(self):
+ """
+ Returns:
+ np.array: the integral over time of the applied ee force-torque
+ """
+ return np.abs((1.0 / self.control_freq) * self.recent_ee_forcetorques.average)
+
+ @property
+ def ee_force(self):
+ """
+ Returns:
+ np.array: force applied at the force sensor at the robot arm's eef
+ """
+ return self.get_sensor_measurement(self.gripper.important_sensors["force_ee"])
+
+ @property
+ def ee_torque(self):
+ """
+ Returns torque applied at the torque sensor at the robot arm's eef
+ """
+ return self.get_sensor_measurement(self.gripper.important_sensors["torque_ee"])
+
+ @property
+ def _hand_pose(self):
+ """
+ Returns:
+ np.array: (4,4) array corresponding to the eef pose in base frame of robot.
+ """
+ return self.pose_in_base_from_name(self.robot_model.eef_name)
+
+ @property
+ def _hand_quat(self):
+ """
+ Returns:
+ np.array: (x,y,z,w) eef quaternion in base frame of robot.
+ """
+ return T.mat2quat(self._hand_orn)
+
+ @property
+ def _hand_total_velocity(self):
+ """
+ Returns:
+ np.array: 6-array representing the total eef velocity (linear + angular) in the base frame
+ """
+
+ # Use jacobian to translate joint velocities to end effector velocities.
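+        # get_body_jacp / get_body_jacr give (3, nv) translational / rotational Jacobians;
+        # keep only this arm's joint columns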
+ Jp = self.sim.data.get_body_jacp(self.robot_model.eef_name).reshape((3, -1))
+ Jp_joint = Jp[:, self._ref_joint_vel_indexes]
+
+ Jr = self.sim.data.get_body_jacr(self.robot_model.eef_name).reshape((3, -1))
+ Jr_joint = Jr[:, self._ref_joint_vel_indexes]
+
+ eef_lin_vel = Jp_joint.dot(self._joint_velocities)
+ eef_rot_vel = Jr_joint.dot(self._joint_velocities)
+ return np.concatenate([eef_lin_vel, eef_rot_vel])
+
+ @property
+ def _hand_pos(self):
+ """
+ Returns:
+ np.array: 3-array representing the position of eef in base frame of robot.
+ """
+ eef_pose_in_base = self._hand_pose
+ return eef_pose_in_base[:3, 3]
+
+ @property
+ def _hand_orn(self):
+ """
+ Returns:
+ np.array: (3,3) array representing the orientation of eef in base frame of robot as a rotation matrix.
+ """
+ eef_pose_in_base = self._hand_pose
+ return eef_pose_in_base[:3, :3]
+
+ @property
+ def _hand_vel(self):
+ """
+ Returns:
+ np.array: (x,y,z) velocity of eef in base frame of robot.
+ """
+ return self._hand_total_velocity[:3]
+
+ @property
+ def _hand_ang_vel(self):
+ """
+ Returns:
+ np.array: (ax,ay,az) angular velocity of eef in base frame of robot.
+ """
+ return self._hand_total_velocity[3:]
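For context, a minimal sketch of how the [arm | gripper] action layout handled in SingleArm.control() above looks from the environment side. The environment ("Lift") and robot ("Panda") are illustrative assumptions; the action dimension is read from the env rather than hard-coded.

```python
import numpy as np
import robosuite as suite

env = suite.make(
    "Lift", robots="Panda",
    has_renderer=False, has_offscreen_renderer=False, use_camera_obs=False,
)
env.reset()

low, high = env.action_spec        # controller limits concatenated with [-1, 1] gripper limits
action = np.zeros(low.shape[0])    # e.g. 7 dims: 6 OSC_POSE deltas + 1 gripper command
action[:3] = [0.0, 0.0, 0.05]      # small Cartesian delta for the arm controller
action[-1] = -1.0                  # trailing dims are the normalized gripper command (open)
obs, reward, done, info = env.step(action)
```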
diff --git a/phantom/submodules/phantom-robosuite/robosuite/scripts/browse_mjcf_model.py b/phantom/submodules/phantom-robosuite/robosuite/scripts/browse_mjcf_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..02f87f6edac3181a4278483be99ea279ad510d7b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/scripts/browse_mjcf_model.py
@@ -0,0 +1,35 @@
+"""Visualize MJCF models.
+
+Loads an MJCF XML model from file and renders it on screen.
+
+Example:
+ $ python browse_mjcf_model.py --filepath ../models/assets/arenas/table_arena.xml
+"""
+
+import argparse
+import os
+
+import mujoco
+
+import robosuite as suite
+from robosuite.utils import OpenCVRenderer
+from robosuite.utils.binding_utils import MjRenderContext, MjSim
+
+if __name__ == "__main__":
+
+ arena_file = os.path.join(suite.models.assets_root, "arenas/pegs_arena.xml")
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--filepath", type=str, default=arena_file)
+ args = parser.parse_args()
+
+ model = mujoco.MjModel.from_xml_path(args.filepath)
+ sim = MjSim(model)
+ render_context = MjRenderContext(sim)
+ sim.add_render_context(render_context)
+ viewer = OpenCVRenderer(sim)
+
+ print("Press ESC to exit...")
+ while True:
+ sim.step()
+ viewer.render()
diff --git a/phantom/submodules/phantom-robosuite/robosuite/scripts/collect_human_demonstrations.py b/phantom/submodules/phantom-robosuite/robosuite/scripts/collect_human_demonstrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..bce59432f856bac14dffeb6f871be733c9a1d8e4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/scripts/collect_human_demonstrations.py
@@ -0,0 +1,253 @@
+"""
+A script to collect a batch of human demonstrations.
+
+The demonstrations can be played back using the `playback_demonstrations_from_hdf5.py` script.
+"""
+
+import argparse
+import datetime
+import json
+import os
+import shutil
+import time
+from glob import glob
+
+import h5py
+import numpy as np
+
+import robosuite as suite
+import robosuite.macros as macros
+from robosuite import load_controller_config
+from robosuite.utils.input_utils import input2action
+from robosuite.wrappers import DataCollectionWrapper, VisualizationWrapper
+
+
+def collect_human_trajectory(env, device, arm, env_configuration):
+ """
+ Use the device (keyboard or SpaceNav 3D mouse) to collect a demonstration.
+ The rollout trajectory is saved to files in npz format.
+ Modify the DataCollectionWrapper wrapper to add new fields or change data formats.
+
+ Args:
+ env (MujocoEnv): environment to control
+ device (Device): to receive controls from the device
+        arm (str): which arm to control, 'right' or 'left' (e.g. for bimanual robots)
+ env_configuration (str): specified environment configuration
+ """
+
+ env.reset()
+
+ # ID = 2 always corresponds to agentview
+ env.render()
+
+ is_first = True
+
+ task_completion_hold_count = -1 # counter to collect 10 timesteps after reaching goal
+ device.start_control()
+
+ # Loop until we get a reset from the input or the task completes
+ while True:
+ # Set active robot
+ active_robot = env.robots[0] if env_configuration == "bimanual" else env.robots[arm == "left"]
+
+ # Get the newest action
+ action, grasp = input2action(
+ device=device, robot=active_robot, active_arm=arm, env_configuration=env_configuration
+ )
+
+ # If action is none, then this a reset so we should break
+ if action is None:
+ break
+
+ # Run environment step
+ env.step(action)
+ env.render()
+
+ # Also break if we complete the task
+ if task_completion_hold_count == 0:
+ break
+
+        # state machine: require success to hold for 10 consecutive timesteps before ending the episode
+ if env._check_success():
+ if task_completion_hold_count > 0:
+ task_completion_hold_count -= 1 # latched state, decrement count
+ else:
+ task_completion_hold_count = 10 # reset count on first success timestep
+ else:
+ task_completion_hold_count = -1 # null the counter if there's no success
+
+ # cleanup for end of data collection episodes
+ env.close()
+
+
+def gather_demonstrations_as_hdf5(directory, out_dir, env_info):
+ """
+ Gathers the demonstrations saved in @directory into a
+ single hdf5 file.
+
+    The structure of the hdf5 file is as follows.
+
+ data (group)
+ date (attribute) - date of collection
+ time (attribute) - time of collection
+ repository_version (attribute) - repository version used during collection
+ env (attribute) - environment name on which demos were collected
+
+ demo1 (group) - every demonstration has a group
+ model_file (attribute) - model xml string for demonstration
+ states (dataset) - flattened mujoco states
+ actions (dataset) - actions applied during demonstration
+
+ demo2 (group)
+ ...
+
+ Args:
+ directory (str): Path to the directory containing raw demonstrations.
+ out_dir (str): Path to where to store the hdf5 file.
+ env_info (str): JSON-encoded string containing environment information,
+ including controller and robot info
+ """
+
+ hdf5_path = os.path.join(out_dir, "demo.hdf5")
+ f = h5py.File(hdf5_path, "w")
+
+ # store some metadata in the attributes of one group
+ grp = f.create_group("data")
+
+ num_eps = 0
+ env_name = None # will get populated at some point
+
+ for ep_directory in os.listdir(directory):
+
+ state_paths = os.path.join(directory, ep_directory, "state_*.npz")
+ states = []
+ actions = []
+ success = False
+
+ for state_file in sorted(glob(state_paths)):
+ dic = np.load(state_file, allow_pickle=True)
+ env_name = str(dic["env"])
+
+ states.extend(dic["states"])
+ for ai in dic["action_infos"]:
+ actions.append(ai["actions"])
+ success = success or dic["successful"]
+
+ if len(states) == 0:
+ continue
+
+ # Add only the successful demonstration to dataset
+ if success:
+ print("Demonstration is successful and has been saved")
+ # Delete the last state. This is because when the DataCollector wrapper
+ # recorded the states and actions, the states were recorded AFTER playing that action,
+ # so we end up with an extra state at the end.
+ del states[-1]
+ assert len(states) == len(actions)
+
+ num_eps += 1
+ ep_data_grp = grp.create_group("demo_{}".format(num_eps))
+
+ # store model xml as an attribute
+ xml_path = os.path.join(directory, ep_directory, "model.xml")
+ with open(xml_path, "r") as f:
+ xml_str = f.read()
+ ep_data_grp.attrs["model_file"] = xml_str
+
+ # write datasets for states and actions
+ ep_data_grp.create_dataset("states", data=np.array(states))
+ ep_data_grp.create_dataset("actions", data=np.array(actions))
+ else:
+ print("Demonstration is unsuccessful and has NOT been saved")
+
+ # write dataset attributes (metadata)
+ now = datetime.datetime.now()
+ grp.attrs["date"] = "{}-{}-{}".format(now.month, now.day, now.year)
+ grp.attrs["time"] = "{}:{}:{}".format(now.hour, now.minute, now.second)
+ grp.attrs["repository_version"] = suite.__version__
+ grp.attrs["env"] = env_name
+ grp.attrs["env_info"] = env_info
+
+ f.close()
+
+
+if __name__ == "__main__":
+ # Arguments
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--directory",
+ type=str,
+ default=os.path.join(suite.models.assets_root, "demonstrations"),
+ )
+ parser.add_argument("--environment", type=str, default="Lift")
+ parser.add_argument("--robots", nargs="+", type=str, default="Panda", help="Which robot(s) to use in the env")
+ parser.add_argument(
+ "--config", type=str, default="single-arm-opposed", help="Specified environment configuration if necessary"
+ )
+ parser.add_argument("--arm", type=str, default="right", help="Which arm to control (eg bimanual) 'right' or 'left'")
+ parser.add_argument("--camera", type=str, default="agentview", help="Which camera to use for collecting demos")
+ parser.add_argument(
+ "--controller", type=str, default="OSC_POSE", help="Choice of controller. Can be 'IK_POSE' or 'OSC_POSE'"
+ )
+ parser.add_argument("--device", type=str, default="keyboard")
+ parser.add_argument("--pos-sensitivity", type=float, default=1.0, help="How much to scale position user inputs")
+ parser.add_argument("--rot-sensitivity", type=float, default=1.0, help="How much to scale rotation user inputs")
+ args = parser.parse_args()
+
+ # Get controller config
+ controller_config = load_controller_config(default_controller=args.controller)
+
+ # Create argument configuration
+ config = {
+ "env_name": args.environment,
+ "robots": args.robots,
+ "controller_configs": controller_config,
+ }
+
+ # Check if we're using a multi-armed environment and use env_configuration argument if so
+ if "TwoArm" in args.environment:
+ config["env_configuration"] = args.config
+
+ # Create environment
+ env = suite.make(
+ **config,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ render_camera=args.camera,
+ ignore_done=True,
+ use_camera_obs=False,
+ reward_shaping=True,
+ control_freq=20,
+ )
+
+ # Wrap this with visualization wrapper
+ env = VisualizationWrapper(env)
+
+ # Grab reference to controller config and convert it to json-encoded string
+ env_info = json.dumps(config)
+
+ # wrap the environment with data collection wrapper
+ tmp_directory = "/tmp/{}".format(str(time.time()).replace(".", "_"))
+ env = DataCollectionWrapper(env, tmp_directory)
+
+ # initialize device
+ if args.device == "keyboard":
+ from robosuite.devices import Keyboard
+
+ device = Keyboard(pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
+ elif args.device == "spacemouse":
+ from robosuite.devices import SpaceMouse
+
+ device = SpaceMouse(pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
+ else:
+ raise Exception("Invalid device choice: choose either 'keyboard' or 'spacemouse'.")
+
+ # make a new timestamped directory
+ t1, t2 = str(time.time()).split(".")
+ new_dir = os.path.join(args.directory, "{}_{}".format(t1, t2))
+ os.makedirs(new_dir)
+
+ # collect demonstrations
+ while True:
+ collect_human_trajectory(env, device, args.arm, args.config)
+ gather_demonstrations_as_hdf5(tmp_directory, new_dir, env_info)
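For context, a minimal sketch of reading back a demo.hdf5 file produced by gather_demonstrations_as_hdf5() above; the path is a placeholder, and the group/attribute layout follows the docstring in that function.

```python
import h5py
import numpy as np

with h5py.File("demo.hdf5", "r") as f:                # placeholder path
    data = f["data"]
    print("env:", data.attrs["env"], "| date:", data.attrs["date"])
    for demo_name in data.keys():                     # "demo_1", "demo_2", ...
        demo = data[demo_name]
        states = np.array(demo["states"])             # flattened mujoco states, one row per step
        actions = np.array(demo["actions"])           # actions applied at each step
        # demo.attrs["model_file"] holds the model XML recorded for this episode
        print(demo_name, states.shape, actions.shape)
```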
diff --git a/phantom/submodules/phantom-robosuite/robosuite/scripts/compile_mjcf_model.py b/phantom/submodules/phantom-robosuite/robosuite/scripts/compile_mjcf_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5b9334a00d4b3fdfa5a2ab8fa06eb4013faf5cc
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/scripts/compile_mjcf_model.py
@@ -0,0 +1,39 @@
+"""Loads a raw mjcf file and saves a compiled mjcf file.
+
+This prevents mujoco-py from complaining about the .urdf extension.
+Also allows assets to be compiled properly.
+
+Example:
+ $ python compile_mjcf_model.py source_mjcf.xml target_mjcf.xml
+"""
+
+import os
+import sys
+from shutil import copyfile
+
+import mujoco
+
+
+def print_usage():
+ print("""python compile_mjcf_model.py input_file output_file""")
+
+
+if __name__ == "__main__":
+
+ if len(sys.argv) != 3:
+ print_usage()
+ exit(0)
+
+ input_file = sys.argv[1]
+ output_file = sys.argv[2]
+ input_folder = os.path.dirname(input_file)
+
+ tempfile = os.path.join(input_folder, ".robosuite_temp_model.xml")
+ copyfile(input_file, tempfile)
+
+    model = mujoco.MjModel.from_xml_path(tempfile)
+    # mujoco.MjModel (official bindings) has no get_xml(); save the compiled XML via mj_saveLastXML
+    mujoco.mj_saveLastXML(output_file, model)
+
+ os.remove(tempfile)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/scripts/make_reset_video.py b/phantom/submodules/phantom-robosuite/robosuite/scripts/make_reset_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8eb7ef70a2721da4ce20b7c6a8f797d6b161eef
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/scripts/make_reset_video.py
@@ -0,0 +1,97 @@
+"""
+Convenience script to make a video out of initial environment
+configurations. This can be a useful debugging tool to understand
+what different sampled environment configurations look like.
+"""
+
+import argparse
+
+import imageio
+import numpy as np
+
+import robosuite as suite
+from robosuite.controllers import load_controller_config
+from robosuite.utils.input_utils import *
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # camera to use for generating frames
+ parser.add_argument(
+ "--camera",
+ type=str,
+ default="agentview",
+ )
+
+ # number of frames in output video
+ parser.add_argument(
+ "--frames",
+ type=int,
+ default=10,
+ )
+
+ # path to output video
+ parser.add_argument(
+ "--output",
+ type=str,
+ default="reset.mp4",
+ )
+
+ args = parser.parse_args()
+ camera_name = args.camera
+ num_frames = args.frames
+ output_path = args.output
+
+ # Create dict to hold options that will be passed to env creation call
+ options = {}
+
+ # print welcome info
+ print("Welcome to robosuite v{}!".format(suite.__version__))
+ print(suite.__logo__)
+
+ # Choose environment and add it to options
+ options["env_name"] = choose_environment()
+
+ # If a multi-arm environment has been chosen, choose configuration and appropriate robot(s)
+ if "TwoArm" in options["env_name"]:
+ # Choose env config and add it to options
+ options["env_configuration"] = choose_multi_arm_config()
+
+ # If chosen configuration was bimanual, the corresponding robot must be Baxter. Else, have user choose robots
+ if options["env_configuration"] == "bimanual":
+ options["robots"] = "Baxter"
+ else:
+ options["robots"] = []
+
+ # Have user choose two robots
+ print("A multiple single-arm configuration was chosen.\n")
+
+ for i in range(2):
+ print("Please choose Robot {}...\n".format(i))
+ options["robots"].append(choose_robots(exclude_bimanual=True))
+
+ # Else, we simply choose a single (single-armed) robot to instantiate in the environment
+ else:
+ options["robots"] = choose_robots(exclude_bimanual=True)
+
+ # Load the controller
+ options["controller_configs"] = load_controller_config(default_controller="OSC_POSE")
+
+ # initialize the task
+ env = suite.make(
+ **options,
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ ignore_done=True,
+ use_camera_obs=False,
+ control_freq=20,
+ )
+
+ # write a video
+ video_writer = imageio.get_writer(output_path, fps=5)
+ for i in range(num_frames):
+ env.reset()
+ video_img = env.sim.render(height=512, width=512, camera_name=camera_name)[::-1]
+ env.step(np.zeros_like(env.action_spec[0]))
+ video_writer.append_data(video_img)
+ video_writer.close()
diff --git a/phantom/submodules/phantom-robosuite/robosuite/scripts/playback_demonstrations_from_hdf5.py b/phantom/submodules/phantom-robosuite/robosuite/scripts/playback_demonstrations_from_hdf5.py
new file mode 100644
index 0000000000000000000000000000000000000000..0decbd1b6edd31c3d31de4f52b22327d6fd69a64
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/scripts/playback_demonstrations_from_hdf5.py
@@ -0,0 +1,106 @@
+"""
+A convenience script to playback random demonstrations from
+a set of demonstrations stored in a hdf5 file.
+
+Arguments:
+ --folder (str): Path to demonstrations
+ --use-actions (optional): If this flag is provided, the actions are played back
+ through the MuJoCo simulator, instead of loading the simulator states
+ one by one.
+ --visualize-gripper (optional): If set, will visualize the gripper site
+
+Example:
+ $ python playback_demonstrations_from_hdf5.py --folder ../models/assets/demonstrations/lift/
+"""
+
+import argparse
+import json
+import os
+import random
+
+import h5py
+import numpy as np
+
+import robosuite
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--folder",
+ type=str,
+ help="Path to your demonstration folder that contains the demo.hdf5 file, e.g.: "
+ "'path_to_assets_dir/demonstrations/YOUR_DEMONSTRATION'",
+    )
+ parser.add_argument(
+ "--use-actions",
+ action="store_true",
+ )
+ args = parser.parse_args()
+
+ demo_path = args.folder
+ hdf5_path = os.path.join(demo_path, "demo.hdf5")
+ f = h5py.File(hdf5_path, "r")
+ env_name = f["data"].attrs["env"]
+ env_info = json.loads(f["data"].attrs["env_info"])
+
+ env = robosuite.make(
+ **env_info,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ ignore_done=True,
+ use_camera_obs=False,
+ reward_shaping=True,
+ control_freq=20,
+ )
+
+ # list of all demonstrations episodes
+ demos = list(f["data"].keys())
+
+ while True:
+ print("Playing back random episode... (press ESC to quit)")
+
+ # select an episode randomly
+ ep = random.choice(demos)
+
+ # read the model xml, using the metadata stored in the attribute for this episode
+ model_xml = f["data/{}".format(ep)].attrs["model_file"]
+
+ env.reset()
+ xml = env.edit_model_xml(model_xml)
+ env.reset_from_xml_string(xml)
+ env.sim.reset()
+ env.viewer.set_camera(0)
+
+ # load the flattened mujoco states
+ states = f["data/{}/states".format(ep)][()]
+
+ if args.use_actions:
+
+ # load the initial state
+ env.sim.set_state_from_flattened(states[0])
+ env.sim.forward()
+
+ # load the actions and play them back open-loop
+ actions = np.array(f["data/{}/actions".format(ep)][()])
+ num_actions = actions.shape[0]
+
+ for j, action in enumerate(actions):
+ env.step(action)
+ env.render()
+
+ if j < num_actions - 1:
+ # ensure that the actions deterministically lead to the same recorded states
+ state_playback = env.sim.get_state().flatten()
+ if not np.all(np.equal(states[j + 1], state_playback)):
+ err = np.linalg.norm(states[j + 1] - state_playback)
+ print(f"[warning] playback diverged by {err:.2f} for ep {ep} at step {j}")
+
+ else:
+
+ # force the sequence of internal mujoco states one by one
+ for state in states:
+ env.sim.set_state_from_flattened(state)
+ env.sim.forward()
+ env.render()
+
+ f.close()
diff --git a/phantom/submodules/phantom-robosuite/robosuite/scripts/setup_macros.py b/phantom/submodules/phantom-robosuite/robosuite/scripts/setup_macros.py
new file mode 100644
index 0000000000000000000000000000000000000000..16abdde5ad8246d018e387890e2c32539602e1b3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/scripts/setup_macros.py
@@ -0,0 +1,31 @@
+"""
+This script sets up a private macros file.
+The private macros file (macros_private.py) is not tracked by git,
+allowing user-specific settings to be kept out of version control.
+This script checks if macros_private.py exists.
+If applicable, it creates the private macros at robosuite/macros_private.py
+"""
+
+import os
+import shutil
+
+import robosuite
+
+if __name__ == "__main__":
+ base_path = robosuite.__path__[0]
+ macros_path = os.path.join(base_path, "macros.py")
+ macros_private_path = os.path.join(base_path, "macros_private.py")
+
+    if not os.path.exists(macros_path):
+        print("{} does not exist! Aborting...".format(macros_path))
+        exit(1)
+
+ if os.path.exists(macros_private_path):
+ ans = input("{} already exists! \noverwrite? (y/n)\n".format(macros_private_path))
+
+ if ans == "y":
+ print("REMOVING")
+ else:
+ exit()
+
+ shutil.copyfile(macros_path, macros_private_path)
+ print("copied {}\nto {}".format(macros_path, macros_private_path))
diff --git a/phantom/submodules/phantom-robosuite/robosuite/scripts/tune_camera.py b/phantom/submodules/phantom-robosuite/robosuite/scripts/tune_camera.py
new file mode 100644
index 0000000000000000000000000000000000000000..e27e380b743fd158298157d7f9046a419c36f93f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/scripts/tune_camera.py
@@ -0,0 +1,226 @@
+"""
+Convenience script to tune a camera view in a mujoco environment.
+Allows keyboard presses to move a camera around in the viewer, and
+then prints the final position and quaternion you should set
+for your camera in the mujoco XML file.
+"""
+
+import argparse
+import time
+import xml.etree.ElementTree as ET
+
+import numpy as np
+from pynput.keyboard import Controller, Key, Listener
+
+import robosuite
+import robosuite.utils.transform_utils as T
+from robosuite.utils.camera_utils import CameraMover
+from robosuite.utils.mjcf_utils import find_elements, find_parent
+
+# some settings
+DELTA_POS_KEY_PRESS = 0.05 # delta camera position per key press
+DELTA_ROT_KEY_PRESS = 1 # delta camera angle per key press
+
+
+class KeyboardHandler:
+ def __init__(self, camera_mover):
+ """
+ Store internal state here.
+
+ Args:
+ camera_mover (CameraMover): Playback camera class
+ cam_body_id (int): id corresponding to parent body of camera element
+ """
+ self.camera_mover = camera_mover
+
+ # make a thread to listen to keyboard and register our callback functions
+ self.listener = Listener(on_press=self.on_press, on_release=self.on_release)
+
+ # start listening
+ self.listener.start()
+
+ def on_press(self, key):
+ """
+ Key handler for key presses.
+
+ Args:
+ key (int): keycode corresponding to the key that was pressed
+ """
+
+ try:
+ # controls for moving rotation
+ if key == Key.up:
+ # rotate up
+ self.camera_mover.rotate_camera(point=None, axis=[1.0, 0.0, 0.0], angle=DELTA_ROT_KEY_PRESS)
+ elif key == Key.down:
+ # rotate down
+ self.camera_mover.rotate_camera(point=None, axis=[-1.0, 0.0, 0.0], angle=DELTA_ROT_KEY_PRESS)
+ elif key == Key.left:
+ # rotate left
+ self.camera_mover.rotate_camera(point=None, axis=[0.0, 1.0, 0.0], angle=DELTA_ROT_KEY_PRESS)
+ elif key == Key.right:
+ # rotate right
+ self.camera_mover.rotate_camera(point=None, axis=[0.0, -1.0, 0.0], angle=DELTA_ROT_KEY_PRESS)
+
+ # controls for moving position
+ elif key.char == "w":
+ # move forward
+ self.camera_mover.move_camera(direction=[0.0, 0.0, -1.0], scale=DELTA_POS_KEY_PRESS)
+ elif key.char == "s":
+ # move backward
+ self.camera_mover.move_camera(direction=[0.0, 0.0, 1.0], scale=DELTA_POS_KEY_PRESS)
+ elif key.char == "a":
+ # move left
+ self.camera_mover.move_camera(direction=[-1.0, 0.0, 0.0], scale=DELTA_POS_KEY_PRESS)
+ elif key.char == "d":
+ # move right
+ self.camera_mover.move_camera(direction=[1.0, 0.0, 0.0], scale=DELTA_POS_KEY_PRESS)
+ elif key.char == "r":
+ # move up
+ self.camera_mover.move_camera(direction=[0.0, 1.0, 0.0], scale=DELTA_POS_KEY_PRESS)
+ elif key.char == "f":
+ # move down
+ self.camera_mover.move_camera(direction=[0.0, -1.0, 0.0], scale=DELTA_POS_KEY_PRESS)
+ elif key.char == ".":
+ # rotate counterclockwise
+ self.camera_mover.rotate_camera(point=None, axis=[0.0, 0.0, 1.0], angle=DELTA_ROT_KEY_PRESS)
+ elif key.char == "/":
+ # rotate clockwise
+ self.camera_mover.rotate_camera(point=None, axis=[0.0, 0.0, -1.0], angle=DELTA_ROT_KEY_PRESS)
+
+ except AttributeError as e:
+ pass
+
+ def on_release(self, key):
+ """
+ Key handler for key releases.
+
+ Args:
+ key: [NOT USED]
+ """
+ pass
+
+
+def print_command(char, info):
+ """
+ Prints out the command + relevant info entered by user
+
+ Args:
+ char (str): Command entered
+ info (str): Any additional info to print
+ """
+ char += " " * (10 - len(char))
+ print("{}\t{}".format(char, info))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--env", type=str, default="Lift")
+ parser.add_argument("--robots", nargs="+", type=str, default="Sawyer", help="Which robot(s) to use in the env")
+ args = parser.parse_args()
+
+ print("\nWelcome to the camera tuning script! You will be able to tune a camera view")
+ print("by moving it around using your keyboard. The controls are printed below.")
+
+ print("")
+ print_command("Keys", "Command")
+ print_command("w-s", "zoom the camera in/out")
+ print_command("a-d", "pan the camera left/right")
+ print_command("r-f", "pan the camera up/down")
+ print_command("arrow keys", "rotate the camera to change view direction")
+ print_command(".-/", "rotate the camera view without changing view direction")
+ print("")
+
+ # read camera XML tag from user input
+ inp = input(
+ "\nPlease paste a camera name below \n"
+ "OR xml tag below (e.g. ) \n"
+ "OR leave blank for an example:\n"
+ )
+
+ if len(inp) == 0:
+ if args.env != "Lift":
+ raise Exception("ERROR: env must be Lift to run default example.")
+ print("\nUsing an example tag corresponding to the frontview camera.")
+ print("This xml tag was copied from robosuite/models/assets/arenas/table_arena.xml")
+ inp = ''
+
+ # remember the tag and infer some properties
+ from_tag = "<" in inp
+ notify_str = (
+ "NOTE: using the following xml tag:\n"
+ if from_tag
+ else "NOTE: using the following camera (initialized at default sim location)\n"
+ )
+
+ print(notify_str)
+ print("{}\n".format(inp))
+
+ cam_tree = ET.fromstring(inp) if from_tag else ET.Element("camera", attrib={"name": inp})
+ CAMERA_NAME = cam_tree.get("name")
+
+ # make the environment
+ env = robosuite.make(
+ args.env,
+ robots=args.robots,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ ignore_done=True,
+ use_camera_obs=False,
+ control_freq=100,
+ )
+ env.reset()
+
+ # Create the camera mover
+ camera_mover = CameraMover(
+ env=env,
+ camera=CAMERA_NAME,
+ )
+
+ # Make sure we're using the camera that we're modifying
+ camera_id = env.sim.model.camera_name2id(CAMERA_NAME)
+ env.viewer.set_camera(camera_id=camera_id)
+
+ # Infer initial camera pose
+ if from_tag:
+ initial_file_camera_pos = np.array(cam_tree.get("pos").split(" ")).astype(float)
+ initial_file_camera_quat = T.convert_quat(np.array(cam_tree.get("quat").split(" ")).astype(float), to="xyzw")
+ # Set these values as well
+ camera_mover.set_camera_pose(pos=initial_file_camera_pos, quat=initial_file_camera_quat)
+ # Optionally set fov if specified
+ cam_fov = cam_tree.get("fovy", None)
+ if cam_fov is not None:
+ env.sim.model.cam_fovy[camera_id] = float(cam_fov)
+ else:
+ initial_file_camera_pos, initial_file_camera_quat = camera_mover.get_camera_pose()
+ # Define initial file camera pose
+ initial_file_camera_pose = T.make_pose(initial_file_camera_pos, T.quat2mat(initial_file_camera_quat))
+
+ # remember difference between camera pose in initial tag and absolute camera pose in world
+ initial_world_camera_pos, initial_world_camera_quat = camera_mover.get_camera_pose()
+ initial_world_camera_pose = T.make_pose(initial_world_camera_pos, T.quat2mat(initial_world_camera_quat))
+ world_in_file = initial_file_camera_pose.dot(T.pose_inv(initial_world_camera_pose))
+
+ # register callbacks to handle key presses in the viewer
+ key_handler = KeyboardHandler(camera_mover=camera_mover)
+
+ # just spin to let user interact with window
+ spin_count = 0
+ while True:
+ action = np.zeros(env.action_dim)
+ obs, reward, done, _ = env.step(action)
+ env.render()
+ spin_count += 1
+ if spin_count % 500 == 0:
+ # convert from world coordinates to file coordinates (xml subtree)
+ camera_pos, camera_quat = camera_mover.get_camera_pose()
+ world_camera_pose = T.make_pose(camera_pos, T.quat2mat(camera_quat))
+ file_camera_pose = world_in_file.dot(world_camera_pose)
+ # TODO: Figure out why numba causes black screen of death (specifically, during mat2pose --> mat2quat call below)
+ camera_pos, camera_quat = T.mat2pose(file_camera_pose)
+ camera_quat = T.convert_quat(camera_quat, to="wxyz")
+
+ print("\n\ncurrent camera tag you should copy")
+ cam_tree.set("pos", "{} {} {}".format(camera_pos[0], camera_pos[1], camera_pos[2]))
+ cam_tree.set("quat", "{} {} {} {}".format(camera_quat[0], camera_quat[1], camera_quat[2], camera_quat[3]))
+ print(ET.tostring(cam_tree, encoding="utf8").decode("utf8"))
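+
+# Example invocation (illustrative only; flags match the argparse definitions above):
+#   $ python robosuite/scripts/tune_camera.py --env Lift --robots Sawyer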
diff --git a/phantom/submodules/phantom-robosuite/robosuite/scripts/tune_joints.py b/phantom/submodules/phantom-robosuite/robosuite/scripts/tune_joints.py
new file mode 100644
index 0000000000000000000000000000000000000000..09dedbd5fbe308fa523c8afeb2479347972504d3
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/scripts/tune_joints.py
@@ -0,0 +1,311 @@
+"""
+Convenience script to tune a robot's joint positions in a mujoco environment.
+Allows keyboard presses to move specific robot joints around in the viewer, and
+then prints the current joint positions to the console whenever they are updated.
+
+RELEVANT KEY PRESSES:
+ '1 - n' : Sets the active robot joint being tuned to this number. Maximum
+ is n which is the number of robot joints
+ 't' : Toggle between robot arms being tuned (only applicable for multi-arm environments)
+ 'r' : Resets the active joint values to 0
+ 'UP_ARROW' : Increment the active robot joint position
+ 'DOWN_ARROW' : Decrement the active robot joint position
+ 'RIGHT_ARROW' : Increment the delta joint position change per keypress
+ 'LEFT_ARROW' : Decrement the delta joint position change per keypress
+
+"""
+
+import argparse
+
+import numpy as np
+from pynput.keyboard import Controller, Key, Listener
+
+import robosuite
+from robosuite.robots import SingleArm
+
+
+class KeyboardHandler:
+ def __init__(self, env, delta=0.05):
+ """
+ Store internal state here.
+
+ Args:
+ env (MujocoEnv): Environment to use
+ delta (float): initial joint tuning increment
+ """
+ self.env = env
+ self.delta = delta
+ self.num_robots = len(env.robots)
+ self.active_robot_num = 0
+ self.active_arm_joint = 1
+ self.active_arm = "right" # only relevant for bimanual robots
+ self.current_joints_pos = env.sim.data.qpos[self.active_robot._ref_joint_pos_indexes[: self.num_joints]]
+
+ # make a thread to listen to keyboard and register our callback functions
+ self.listener = Listener(on_press=self.on_press, on_release=self.on_release)
+
+ # start listening
+ self.listener.start()
+
+ def on_press(self, key):
+ """
+ Key handler for key presses.
+
+ Args:
+ key (int): keycode corresponding to the key that was pressed
+ """
+
+ try:
+ if key == Key.up:
+ # Increment the active joint
+ self._update_joint_position(self.active_arm_joint, self.delta)
+ elif key == Key.down:
+ # Decrement the active joint
+ self._update_joint_position(self.active_arm_joint, -self.delta)
+ elif key == Key.right:
+ # Increment the delta value
+ self.delta = min(1.0, self.delta + 0.005)
+ # Print out new value to user
+ print("Delta now = {:.3f}".format(self.delta))
+ elif key == Key.left:
+ # Decrement the delta value
+ self.delta = max(0, self.delta - 0.005)
+ print("Delta now = {:.3f}".format(self.delta))
+ # controls for setting active arm
+ elif key.char == "0":
+                # Notify user that joint indexes are 1-indexed
+ print("Joint Indexes are 1-Indexed. Available joints are 1 - {}".format(self.num_joints))
+ elif key.char == "1":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(1):
+ self.active_arm_joint = 1
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "2":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(2):
+ self.active_arm_joint = 2
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "3":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(3):
+ self.active_arm_joint = 3
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "4":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(4):
+ self.active_arm_joint = 4
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "5":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(5):
+ self.active_arm_joint = 5
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "6":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(6):
+ self.active_arm_joint = 6
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "7":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(7):
+ self.active_arm_joint = 7
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "8":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(8):
+ self.active_arm_joint = 8
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "9":
+ # Make sure range is valid; if so, update this specific joint
+ if self._check_valid_joint(9):
+ self.active_arm_joint = 9
+ # Print out to user
+ print("New joint being tuned: {}".format(self.active_arm_joint))
+ elif key.char == "t":
+ # Toggle active arm
+ self._toggle_arm()
+ elif key.char == "r":
+ # Reset active arm joint qpos to 0
+ self.set_joint_positions(np.zeros(self.num_joints))
+
+ except AttributeError as e:
+ pass
+
+ def on_release(self, key):
+ """
+ Key handler for key releases.
+
+ Args:
+ key: [NOT USED]
+ """
+ pass
+
+ def set_joint_positions(self, qpos):
+ """
+ Automatically sets the joint positions to be the given value
+
+ Args:
+ qpos (np.array): Joint positions to set
+ """
+ self.current_joints_pos = qpos
+ self._update_joint_position(1, 0)
+
+ def _check_valid_joint(self, i):
+ """
+ Checks to make sure joint number request @i is within valid range
+
+ Args:
+ i (int): Index to validate
+
+ Returns:
+ bool: True if index @i is valid, else prints out an error and returns False
+ """
+ if i > self.num_joints:
+ # Print error
+ print("Error: Requested joint {} is out of range; available joints are 1 - {}".format(i, self.num_joints))
+ return False
+ else:
+ return True
+
+ def _toggle_arm(self):
+ """
+ Toggle between arms in the environment to set as current active arm
+ """
+ if isinstance(self.active_robot, SingleArm):
+ self.active_robot_num = (self.active_robot_num + 1) % self.num_robots
+ robot = self.active_robot_num
+ else: # Bimanual case
+ self.active_arm = "left" if self.active_arm == "right" else "right"
+ robot = self.active_arm
+ # Reset joint being controlled to 1
+ self.active_arm_joint = 1
+ # Print out new robot to user
+ print("New robot arm being tuned: {}".format(robot))
+
+ def _update_joint_position(self, i, delta):
+ """
+ Updates specified joint position @i by value @delta from its current position
+ Note: assumes @i is already within the valid joint range
+
+ Args:
+ i (int): Joint index to update
+ delta (float): Increment to alter specific joint by
+ """
+ self.current_joints_pos[i - 1] += delta
+ if isinstance(self.active_robot, SingleArm):
+ robot = self.active_robot_num
+ self.env.sim.data.qpos[self.active_robot._ref_joint_pos_indexes] = self.current_joints_pos
+ else: # Bimanual case
+ robot = self.active_arm
+ if self.active_arm == "right":
+ self.env.sim.data.qpos[
+ self.active_robot._ref_joint_pos_indexes[: self.num_joints]
+ ] = self.current_joints_pos
+ else: # left arm case
+ self.env.sim.data.qpos[
+ self.active_robot._ref_joint_pos_indexes[self.num_joints :]
+ ] = self.current_joints_pos
+ # Print out current joint positions to user
+ print("Robot {} joint qpos: {}".format(robot, self.current_joints_pos))
+
+ @property
+ def active_robot(self):
+ """
+ Returns:
+ Robot: active robot arm currently being tuned
+ """
+ return self.env.robots[self.active_robot_num]
+
+ @property
+ def num_joints(self):
+ """
+ Returns:
+ int: number of joints for the current arm
+ """
+ if isinstance(self.active_robot, SingleArm):
+ return len(self.active_robot.torque_limits[0])
+ else: # Bimanual arm case
+ return int(len(self.active_robot.torque_limits[0]) / 2)
+
+
+def print_command(char, info):
+ """
+ Prints out the command + relevant info entered by user
+
+ Args:
+ char (str): Command entered
+ info (str): Any additional info to print
+ """
+ char += " " * (10 - len(char))
+ print("{}\t{}".format(char, info))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--env", type=str, default="Lift")
+ parser.add_argument("--robots", nargs="+", type=str, default="Panda", help="Which robot(s) to use in the env")
+ parser.add_argument(
+ "--init_qpos", nargs="+", type=float, default=0, help="Initial qpos to use. 0 defaults to all zeros"
+ )
+
+ args = parser.parse_args()
+
+ print(
+ "\nWelcome to the joint tuning script! You will be able to tune the robot\n"
+ "arm joints in the specified environment by using your keyboard. The \n"
+ "controls are printed below:"
+ )
+
+ print("")
+ print_command("Keys", "Command")
+ print_command("1-N", "Active Joint being tuned (N=number of joints for the active arm)")
+ print_command("t", "Toggle between robot arms in the environment")
+ print_command("r", "Reset active arm joints to all 0s")
+ print_command("up/down", "incr/decrement the active joint angle")
+ print_command("right/left", "incr/decrement the delta joint angle per up/down keypress")
+ print("")
+
+ # Setup printing options for numbers
+ np.set_printoptions(formatter={"float": lambda x: "{0:0.3f}".format(x)})
+
+ # Define the controller
+ controller_config = robosuite.load_controller_config(default_controller="JOINT_POSITION")
+
+ # make the environment
+ env = robosuite.make(
+ args.env,
+ robots=args.robots,
+ has_renderer=True,
+ has_offscreen_renderer=False,
+ ignore_done=True,
+ use_camera_obs=False,
+ control_freq=20,
+ render_camera=None,
+ controller_configs=controller_config,
+ initialization_noise=None,
+ )
+ env.reset()
+
+ # register callbacks to handle key presses in the viewer
+ key_handler = KeyboardHandler(env=env)
+
+ # Set initial state
+ if type(args.init_qpos) == int and args.init_qpos == 0:
+ # Default to all zeros
+ pass
+ else:
+ key_handler.set_joint_positions(args.init_qpos)
+
+ # just spin to let user interact with window
+ while True:
+ action = np.zeros(env.action_dim)
+ obs, reward, done, _ = env.step(action)
+ env.render()
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f80caa075d3150eb8346a4aae00fdf6438f499
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/__init__.py
@@ -0,0 +1,3 @@
+from .errors import robosuiteError, XMLError, SimulationError, RandomizationError
+
+from .opencv_renderer import OpenCVRenderer
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/binding_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/binding_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd46540e1a316603946cbf118d912d92f0ca604c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/binding_utils.py
@@ -0,0 +1,1177 @@
+"""
+Useful classes for supporting DeepMind MuJoCo binding.
+"""
+
+import gc
+import os
+from tempfile import TemporaryDirectory
+
+# DIRTY HACK copied from mujoco-py - a global lock on rendering
+from threading import Lock
+
+import mujoco
+import numpy as np
+
+_MjSim_render_lock = Lock()
+
+import ctypes
+import ctypes.util
+import os
+import platform
+import subprocess
+
+import robosuite.macros as macros
+
+_SYSTEM = platform.system()
+if _SYSTEM == "Windows":
+ ctypes.WinDLL(os.path.join(os.path.dirname(__file__), "mujoco.dll"))
+
+CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", "")
+if CUDA_VISIBLE_DEVICES != "":
+ MUJOCO_EGL_DEVICE_ID = os.environ.get("MUJOCO_EGL_DEVICE_ID", None)
+ if MUJOCO_EGL_DEVICE_ID is not None:
+ assert MUJOCO_EGL_DEVICE_ID.isdigit() and (
+ MUJOCO_EGL_DEVICE_ID in CUDA_VISIBLE_DEVICES
+ ), "MUJOCO_EGL_DEVICE_ID needs to be set to one of the device id specified in CUDA_VISIBLE_DEVICES"
+
+if macros.MUJOCO_GPU_RENDERING and os.environ.get("MUJOCO_GL", None) not in ["osmesa", "glx"]:
+ # If gpu rendering is specified in macros, then we enforce gpu
+ # option for rendering
+ if _SYSTEM == "Darwin":
+ os.environ["MUJOCO_GL"] = "cgl"
+ else:
+ os.environ["MUJOCO_GL"] = "egl"
+_MUJOCO_GL = os.environ.get("MUJOCO_GL", "").lower().strip()
+if _MUJOCO_GL not in ("disable", "disabled", "off", "false", "0"):
+ _VALID_MUJOCO_GL = ("enable", "enabled", "on", "true", "1", "glfw", "")
+ if _SYSTEM == "Linux":
+ _VALID_MUJOCO_GL += ("glx", "egl", "osmesa")
+ elif _SYSTEM == "Windows":
+ _VALID_MUJOCO_GL += ("wgl",)
+ elif _SYSTEM == "Darwin":
+ _VALID_MUJOCO_GL += ("cgl",)
+ if _MUJOCO_GL not in _VALID_MUJOCO_GL:
+ raise RuntimeError(f"invalid value for environment variable MUJOCO_GL: {_MUJOCO_GL}")
+ if _SYSTEM == "Linux" and _MUJOCO_GL == "osmesa":
+ from robosuite.renderers.context.osmesa_context import OSMesaGLContext as GLContext
+ elif _SYSTEM == "Linux" and _MUJOCO_GL == "egl":
+ from robosuite.renderers.context.egl_context import EGLGLContext as GLContext
+ else:
+ from robosuite.renderers.context.glfw_context import GLFWGLContext as GLContext
+
+
+class MjRenderContext:
+ """
+ Class that encapsulates rendering functionality for a
+ MuJoCo simulation.
+
+ See https://github.com/openai/mujoco-py/blob/4830435a169c1f3e3b5f9b58a7c3d9c39bdf4acb/mujoco_py/mjrendercontext.pyx
+ """
+
+ def __init__(self, sim, offscreen=True, device_id=-1, max_width=640, max_height=480):
+ assert offscreen, "only offscreen supported for now"
+ self.sim = sim
+ self.offscreen = offscreen
+ self.device_id = device_id
+
+ # setup GL context with defaults for now
+ self.gl_ctx = GLContext(max_width=max_width, max_height=max_height, device_id=self.device_id)
+ self.gl_ctx.make_current()
+
+ # Ensure the model data has been updated so that there
+ # is something to render
+ sim.forward()
+ # make sure sim has this context
+ sim.add_render_context(self)
+
+ self.model = sim.model
+ self.data = sim.data
+
+ # create default scene
+ self.scn = mujoco.MjvScene(sim.model._model, maxgeom=1000)
+
+ # camera
+ self.cam = mujoco.MjvCamera()
+ self.cam.fixedcamid = 0
+ self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
+
+ # options for visual / collision mesh can be set externally, e.g. vopt.geomgroup[0], vopt.geomgroup[1]
+ self.vopt = mujoco.MjvOption()
+
+ self.pert = mujoco.MjvPerturb()
+ self.pert.active = 0
+ self.pert.select = 0
+ self.pert.skinselect = -1
+
+ # self._markers = []
+ # self._overlay = {}
+
+ self._set_mujoco_context_and_buffers()
+
+ def _set_mujoco_context_and_buffers(self):
+ self.con = mujoco.MjrContext(self.model._model, mujoco.mjtFontScale.mjFONTSCALE_150)
+ mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_OFFSCREEN, self.con)
+
+ def update_offscreen_size(self, width, height):
+ if (width != self.con.offWidth) or (height != self.con.offHeight):
+ self.model.vis.global_.offwidth = width
+ self.model.vis.global_.offheight = height
+ self.con.free()
+ del self.con
+ self._set_mujoco_context_and_buffers()
+
+ def upload_texture(self, tex_id):
+ """Uploads given texture to the GPU"""
+ self.gl_ctx.make_current()
+ mujoco.mjr_uploadTexture(self.model, self.con, tex_id)
+
+ def render(self, width, height, camera_id=None, segmentation=False):
+ viewport = mujoco.MjrRect(0, 0, width, height)
+
+ # if self.sim.render_callback is not None:
+ # self.sim.render_callback(self.sim, self)
+
+ # update width and height of rendering context if necessary
+ if width > self.con.offWidth or height > self.con.offHeight:
+ new_width = max(width, self.model.vis.global_.offwidth)
+ new_height = max(height, self.model.vis.global_.offheight)
+ self.update_offscreen_size(new_width, new_height)
+
+ if camera_id is not None:
+ if camera_id == -1:
+ self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
+ else:
+ self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
+ self.cam.fixedcamid = camera_id
+
+ mujoco.mjv_updateScene(
+ self.model._model, self.data._data, self.vopt, self.pert, self.cam, mujoco.mjtCatBit.mjCAT_ALL, self.scn
+ )
+
+ if segmentation:
+ self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 1
+ self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 1
+
+ # for marker_params in self._markers:
+ # self._add_marker_to_scene(marker_params)
+
+ mujoco.mjr_render(viewport=viewport, scn=self.scn, con=self.con)
+ # for gridpos, (text1, text2) in self._overlay.items():
+ # mjr_overlay(const.FONTSCALE_150, gridpos, rect, text1.encode(), text2.encode(), &self._con)
+
+ if segmentation:
+ self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 0
+ self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 0
+
+ def read_pixels(self, width, height, depth=False, segmentation=False):
+ viewport = mujoco.MjrRect(0, 0, width, height)
+ rgb_img = np.empty((height, width, 3), dtype=np.uint8)
+ depth_img = np.empty((height, width), dtype=np.float32) if depth else None
+
+ mujoco.mjr_readPixels(rgb=rgb_img, depth=depth_img, viewport=viewport, con=self.con)
+
+ ret_img = rgb_img
+ if segmentation:
+ seg_img = rgb_img[:, :, 0] + rgb_img[:, :, 1] * (2**8) + rgb_img[:, :, 2] * (2**16)
+ seg_img[seg_img >= (self.scn.ngeom + 1)] = 0
+ seg_ids = np.full((self.scn.ngeom + 1, 2), fill_value=-1, dtype=np.int32)
+
+ for i in range(self.scn.ngeom):
+ geom = self.scn.geoms[i]
+ if geom.segid != -1:
+ seg_ids[geom.segid + 1, 0] = geom.objtype
+ seg_ids[geom.segid + 1, 1] = geom.objid
+ ret_img = seg_ids[seg_img]
+
+ if depth:
+ return (ret_img, depth_img)
+ else:
+ return ret_img
+
+ def upload_texture(self, tex_id):
+ """Uploads given texture to the GPU."""
+ self.gl_ctx.make_current()
+ mujoco.mjr_uploadTexture(self.model, self.con, tex_id)
+
+ def __del__(self):
+ # free mujoco rendering context and GL rendering context
+ self.con.free()
+ self.gl_ctx.free()
+ del self.con
+ del self.gl_ctx
+ del self.scn
+ del self.cam
+ del self.vopt
+ del self.pert
+
+
+class MjRenderContextOffscreen(MjRenderContext):
+ def __init__(self, sim, device_id, max_width=640, max_height=480):
+ super().__init__(sim, offscreen=True, device_id=device_id, max_width=max_width, max_height=max_height)
+
+
+class MjSimState:
+ """
+ A mujoco simulation state.
+ """
+
+ def __init__(self, time, qpos, qvel):
+ self.time = time
+ self.qpos = qpos
+ self.qvel = qvel
+
+ @classmethod
+ def from_flattened(cls, array, sim):
+ """
+ Takes flat mjstate array and MjSim instance and
+ returns MjSimState.
+ """
+ idx_time = 0
+ idx_qpos = idx_time + 1
+ idx_qvel = idx_qpos + sim.model.nq
+
+ time = array[idx_time]
+ qpos = array[idx_qpos : idx_qpos + sim.model.nq]
+ qvel = array[idx_qvel : idx_qvel + sim.model.nv]
+ assert sim.model.na == 0
+
+ return cls(time=time, qpos=qpos, qvel=qvel)
+
+ def flatten(self):
+ return np.concatenate([[self.time], self.qpos, self.qvel], axis=0)
+
+
+class _MjModelMeta(type):
+ """
+ Metaclass which allows MjModel below to delegate to mujoco.MjModel.
+
+ Taken from dm_control: https://github.com/deepmind/dm_control/blob/main/dm_control/mujoco/wrapper/core.py#L244
+ """
+
+ def __new__(cls, name, bases, dct):
+ for attr in dir(mujoco.MjModel):
+ if not attr.startswith("_"):
+ if attr not in dct:
+ # pylint: disable=protected-access
+ fget = lambda self, attr=attr: getattr(self._model, attr)
+ fset = lambda self, value, attr=attr: setattr(self._model, attr, value)
+ # pylint: enable=protected-access
+ dct[attr] = property(fget, fset)
+ return super().__new__(cls, name, bases, dct)
+
+
+class MjModel(metaclass=_MjModelMeta):
+ """Wrapper class for a MuJoCo 'mjModel' instance.
+ MjModel encapsulates features of the model that are expected to remain
+ constant. It also contains simulation and visualization options which may be
+ changed occasionally, although this is done explicitly by the user.
+ """
+
+ _HAS_DYNAMIC_ATTRIBUTES = True
+
+ def __init__(self, model_ptr):
+ """Creates a new MjModel instance from a mujoco.MjModel."""
+ self._model = model_ptr
+
+ # make useful mappings such as _body_name2id and _body_id2name
+ self.make_mappings()
+
+ @classmethod
+ def from_xml_path(cls, xml_path):
+ """Creates an MjModel instance from a path to a model XML file."""
+ model_ptr = _get_model_ptr_from_xml(xml_path=xml_path)
+ return cls(model_ptr)
+
+ def __del__(self):
+ # free mujoco model
+ del self._model
+
+ """
+ Some methods supported by sim.model in mujoco-py.
+ Copied from https://github.com/openai/mujoco-py/blob/ab86d331c9a77ae412079c6e58b8771fe63747fc/mujoco_py/generated/wrappers.pxi#L2611
+ """
+
+ def _extract_mj_names(self, name_adr, num_obj, obj_type):
+ """
+ See https://github.com/openai/mujoco-py/blob/ab86d331c9a77ae412079c6e58b8771fe63747fc/mujoco_py/generated/wrappers.pxi#L1127
+ """
+
+ ### TODO: fix this to use @name_adr like mujoco-py - more robust than assuming IDs are continuous ###
+
+ # objects don't need to be named in the XML, so name might be None
+ id2name = {i: None for i in range(num_obj)}
+ name2id = {}
+ for i in range(num_obj):
+ name = mujoco.mj_id2name(self._model, obj_type, i)
+ name2id[name] = i
+ id2name[i] = name
+
+ # # objects don't need to be named in the XML, so name might be None
+ # id2name = { i: None for i in range(num_obj) }
+ # name2id = {}
+ # for i in range(num_obj):
+ # name = self.model.names[name_adr[i]]
+ # decoded_name = name.decode()
+ # if decoded_name:
+ # obj_id = mujoco.mj_name2id(self.model, obj_type, name)
+ # assert (0 <= obj_id < num_obj) and (id2name[obj_id] is None)
+ # name2id[decoded_name] = obj_id
+ # id2name[obj_id] = decoded_name
+
+ # sort names by increasing id to keep order deterministic
+ return tuple(id2name[nid] for nid in sorted(name2id.values())), name2id, id2name
+
+ def make_mappings(self):
+ """
+ Make some useful internal mappings that mujoco-py supported.
+ """
+ p = self
+ self.body_names, self._body_name2id, self._body_id2name = self._extract_mj_names(
+ p.name_bodyadr, p.nbody, mujoco.mjtObj.mjOBJ_BODY
+ )
+ self.joint_names, self._joint_name2id, self._joint_id2name = self._extract_mj_names(
+ p.name_jntadr, p.njnt, mujoco.mjtObj.mjOBJ_JOINT
+ )
+ self.geom_names, self._geom_name2id, self._geom_id2name = self._extract_mj_names(
+ p.name_geomadr, p.ngeom, mujoco.mjtObj.mjOBJ_GEOM
+ )
+ self.site_names, self._site_name2id, self._site_id2name = self._extract_mj_names(
+ p.name_siteadr, p.nsite, mujoco.mjtObj.mjOBJ_SITE
+ )
+ self.light_names, self._light_name2id, self._light_id2name = self._extract_mj_names(
+ p.name_lightadr, p.nlight, mujoco.mjtObj.mjOBJ_LIGHT
+ )
+ self.camera_names, self._camera_name2id, self._camera_id2name = self._extract_mj_names(
+ p.name_camadr, p.ncam, mujoco.mjtObj.mjOBJ_CAMERA
+ )
+ self.actuator_names, self._actuator_name2id, self._actuator_id2name = self._extract_mj_names(
+ p.name_actuatoradr, p.nu, mujoco.mjtObj.mjOBJ_ACTUATOR
+ )
+ self.sensor_names, self._sensor_name2id, self._sensor_id2name = self._extract_mj_names(
+ p.name_sensoradr, p.nsensor, mujoco.mjtObj.mjOBJ_SENSOR
+ )
+ self.tendon_names, self._tendon_name2id, self._tendon_id2name = self._extract_mj_names(
+ p.name_tendonadr, p.ntendon, mujoco.mjtObj.mjOBJ_TENDON
+ )
+ self.mesh_names, self._mesh_name2id, self._mesh_id2name = self._extract_mj_names(
+ p.name_meshadr, p.nmesh, mujoco.mjtObj.mjOBJ_MESH
+ )
+
+ def body_id2name(self, id):
+ """Get body name from mujoco body id."""
+ if id not in self._body_id2name:
+ raise ValueError("No body with id %d exists." % id)
+ return self._body_id2name[id]
+
+ def body_name2id(self, name):
+ """Get body id from mujoco body name."""
+ if name not in self._body_name2id:
+ raise ValueError('No "body" with name %s exists. Available "body" names = %s.' % (name, self.body_names))
+ return self._body_name2id[name]
+
+ def joint_id2name(self, id):
+ """Get joint name from mujoco joint id."""
+ if id not in self._joint_id2name:
+ raise ValueError("No joint with id %d exists." % id)
+ return self._joint_id2name[id]
+
+ def joint_name2id(self, name):
+ """Get joint id from joint name."""
+ if name not in self._joint_name2id:
+ raise ValueError('No "joint" with name %s exists. Available "joint" names = %s.' % (name, self.joint_names))
+ return self._joint_name2id[name]
+
+ def geom_id2name(self, id):
+ """Get geom name from geom id."""
+ if id not in self._geom_id2name:
+ raise ValueError("No geom with id %d exists." % id)
+ return self._geom_id2name[id]
+
+ def geom_name2id(self, name):
+ """Get geom id from geom name."""
+ if name not in self._geom_name2id:
+ raise ValueError('No "geom" with name %s exists. Available "geom" names = %s.' % (name, self.geom_names))
+ return self._geom_name2id[name]
+
+ def site_id2name(self, id):
+ """Get site name from site id."""
+ if id not in self._site_id2name:
+ raise ValueError("No site with id %d exists." % id)
+ return self._site_id2name[id]
+
+ def site_name2id(self, name):
+ """Get site id from site name."""
+ if name not in self._site_name2id:
+ raise ValueError('No "site" with name %s exists. Available "site" names = %s.' % (name, self.site_names))
+ return self._site_name2id[name]
+
+ def light_id2name(self, id):
+ """Get light name from light id."""
+ if id not in self._light_id2name:
+ raise ValueError("No light with id %d exists." % id)
+ return self._light_id2name[id]
+
+ def light_name2id(self, name):
+ """Get light id from light name."""
+ if name not in self._light_name2id:
+ raise ValueError('No "light" with name %s exists. Available "light" names = %s.' % (name, self.light_names))
+ return self._light_name2id[name]
+
+ def camera_id2name(self, id):
+ """Get camera name from camera id."""
+ if id not in self._camera_id2name:
+ raise ValueError("No camera with id %d exists." % id)
+ return self._camera_id2name[id]
+
+ def camera_name2id(self, name):
+ """Get camera id from camera name."""
+ if name not in self._camera_name2id:
+ raise ValueError(
+ 'No "camera" with name %s exists. Available "camera" names = %s.' % (name, self.camera_names)
+ )
+ return self._camera_name2id[name]
+
+ def actuator_id2name(self, id):
+ """Get actuator name from actuator id."""
+ if id not in self._actuator_id2name:
+ raise ValueError("No actuator with id %d exists." % id)
+ return self._actuator_id2name[id]
+
+ def actuator_name2id(self, name):
+ """Get actuator id from actuator name."""
+ if name not in self._actuator_name2id:
+ raise ValueError(
+ 'No "actuator" with name %s exists. Available "actuator" names = %s.' % (name, self.actuator_names)
+ )
+ return self._actuator_name2id[name]
+
+ def sensor_id2name(self, id):
+ """Get sensor name from sensor id."""
+ if id not in self._sensor_id2name:
+ raise ValueError("No sensor with id %d exists." % id)
+ return self._sensor_id2name[id]
+
+ def sensor_name2id(self, name):
+ """Get sensor id from sensor name."""
+ if name not in self._sensor_name2id:
+ raise ValueError(
+ 'No "sensor" with name %s exists. Available "sensor" names = %s.' % (name, self.sensor_names)
+ )
+ return self._sensor_name2id[name]
+
+ def tendon_id2name(self, id):
+ """Get tendon name from tendon id."""
+ if id not in self._tendon_id2name:
+ raise ValueError("No tendon with id %d exists." % id)
+ return self._tendon_id2name[id]
+
+ def tendon_name2id(self, name):
+ """Get tendon id from tendon name."""
+ if name not in self._tendon_name2id:
+ raise ValueError(
+ 'No "tendon" with name %s exists. Available "tendon" names = %s.' % (name, self.tendon_names)
+ )
+ return self._tendon_name2id[name]
+
+ def mesh_id2name(self, id):
+ """Get mesh name from mesh id."""
+ if id not in self._mesh_id2name:
+ raise ValueError("No mesh with id %d exists." % id)
+ return self._mesh_id2name[id]
+
+ def mesh_name2id(self, name):
+ """Get mesh id from mesh name."""
+ if name not in self._mesh_name2id:
+ raise ValueError('No "mesh" with name %s exists. Available "mesh" names = %s.' % (name, self.mesh_names))
+ return self._mesh_name2id[name]
+
+ # def userdata_id2name(self, id):
+ # if id not in self._userdata_id2name:
+ # raise ValueError("No userdata with id %d exists." % id)
+ # return self._userdata_id2name[id]
+
+ # def userdata_name2id(self, name):
+ # if name not in self._userdata_name2id:
+ # raise ValueError("No \"userdata\" with name %s exists. Available \"userdata\" names = %s." % (name, self.userdata_names))
+ # return self._userdata_name2id[name]
+
+ def get_xml(self):
+ with TemporaryDirectory() as td:
+ filename = os.path.join(td, "model.xml")
+ ret = mujoco.mj_saveLastXML(filename.encode(), self._model)
+ return open(filename).read()
+
+ def get_joint_qpos_addr(self, name):
+ """
+ See https://github.com/openai/mujoco-py/blob/ab86d331c9a77ae412079c6e58b8771fe63747fc/mujoco_py/generated/wrappers.pxi#L1178
+
+ Returns the qpos address for given joint.
+ Returns:
+ - address (int, tuple): returns int address if 1-dim joint, otherwise
+            returns a (start, end) tuple for pos[start:end] access.
+ """
+ joint_id = self.joint_name2id(name)
+ joint_type = self.jnt_type[joint_id]
+ joint_addr = self.jnt_qposadr[joint_id]
+ if joint_type == mujoco.mjtJoint.mjJNT_FREE:
+ ndim = 7
+ elif joint_type == mujoco.mjtJoint.mjJNT_BALL:
+ ndim = 4
+ else:
+ assert joint_type in (mujoco.mjtJoint.mjJNT_HINGE, mujoco.mjtJoint.mjJNT_SLIDE)
+ ndim = 1
+
+ if ndim == 1:
+ return joint_addr
+ else:
+ return (joint_addr, joint_addr + ndim)
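+
+    # Illustrative usage (a minimal sketch; "robot0_joint1" is an assumed joint name):
+    #   addr = sim.model.get_joint_qpos_addr("robot0_joint1")
+    #   q = sim.data.qpos[addr] if isinstance(addr, int) else sim.data.qpos[addr[0]:addr[1]]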
+
+ def get_joint_qvel_addr(self, name):
+ """
+ See https://github.com/openai/mujoco-py/blob/ab86d331c9a77ae412079c6e58b8771fe63747fc/mujoco_py/generated/wrappers.pxi#L1202
+
+ Returns the qvel address for given joint.
+ Returns:
+ - address (int, tuple): returns int address if 1-dim joint, otherwise
+            returns a (start, end) tuple for vel[start:end] access.
+ """
+ joint_id = self.joint_name2id(name)
+ joint_type = self.jnt_type[joint_id]
+ joint_addr = self.jnt_dofadr[joint_id]
+ if joint_type == mujoco.mjtJoint.mjJNT_FREE:
+ ndim = 6
+ elif joint_type == mujoco.mjtJoint.mjJNT_BALL:
+ ndim = 3
+ else:
+ assert joint_type in (mujoco.mjtJoint.mjJNT_HINGE, mujoco.mjtJoint.mjJNT_SLIDE)
+ ndim = 1
+
+ if ndim == 1:
+ return joint_addr
+ else:
+ return (joint_addr, joint_addr + ndim)
+
+
+class _MjDataMeta(type):
+ """
+ Metaclass which allows MjData below to delegate to mujoco.MjData.
+
+ Taken from dm_control.
+ """
+
+ def __new__(cls, name, bases, dct):
+ for attr in dir(mujoco.MjData):
+ if not attr.startswith("_"):
+ if attr not in dct:
+ # pylint: disable=protected-access
+ fget = lambda self, attr=attr: getattr(self._data, attr)
+ fset = lambda self, value, attr=attr: setattr(self._data, attr, value)
+ # pylint: enable=protected-access
+ dct[attr] = property(fget, fset)
+ return super().__new__(cls, name, bases, dct)
+
+
+class MjData(metaclass=_MjDataMeta):
+ """Wrapper class for a MuJoCo 'mjData' instance.
+ MjData contains all of the dynamic variables and intermediate results produced
+ by the simulation. These are expected to change on each simulation timestep.
+ The properties without docstrings are defined in mujoco source code from https://github.com/deepmind/mujoco/blob/062cb53a4a14b2a7a900453613a7ce498728f9d8/include/mujoco/mjdata.h#L126.
+ """
+
+ def __init__(self, model):
+ """Construct a new MjData instance.
+ Args:
+ model: An MjModel instance.
+ """
+ self._model = model
+ self._data = mujoco.MjData(model._model)
+
+ @property
+ def model(self):
+ """The parent MjModel for this MjData instance."""
+ return self._model
+
+ def __del__(self):
+ # free mujoco data
+ del self._data
+
+ """
+ Some methods supported by sim.data in mujoco-py.
+ Copied from https://github.com/openai/mujoco-py/blob/ab86d331c9a77ae412079c6e58b8771fe63747fc/mujoco_py/generated/wrappers.pxi#L2611
+ """
+
+ @property
+ def body_xpos(self):
+ """
+ Note: mujoco-py used to support sim.data.body_xpos but DM mujoco bindings requires sim.data.xpos,
+ so we explicitly expose this as a property
+ """
+ return self._data.xpos
+
+ @property
+ def body_xquat(self):
+ """
+ Note: mujoco-py used to support sim.data.body_xquat but DM mujoco bindings requires sim.data.xquat,
+ so we explicitly expose this as a property
+ """
+ return self._data.xquat
+
+ @property
+ def body_xmat(self):
+ """
+        Note: mujoco-py used to support sim.data.body_xmat but DM mujoco bindings requires sim.data.xmat,
+ so we explicitly expose this as a property
+ """
+ return self._data.xmat
+
+ def get_body_xpos(self, name):
+ """
+ Query cartesian position of a mujoco body using a name string.
+
+ Args:
+ name (str): The name of a mujoco body
+ Returns:
+ xpos (np.ndarray): The xpos value of the mujoco body
+ """
+ bid = self.model.body_name2id(name)
+ return self.xpos[bid]
+
+ def get_body_xquat(self, name):
+ """
+ Query the rotation of a mujoco body in quaternion (in wxyz convention) using a name string.
+
+ Args:
+ name (str): The name of a mujoco body
+ Returns:
+ xquat (np.ndarray): The xquat value of the mujoco body
+ """
+ bid = self.model.body_name2id(name)
+ return self.xquat[bid]
+
+ def get_body_xmat(self, name):
+ """
+ Query the rotation of a mujoco body in a rotation matrix using a name string.
+
+ Args:
+ name (str): The name of a mujoco body
+ Returns:
+ xmat (np.ndarray): The xmat value of the mujoco body
+ """
+ bid = self.model.body_name2id(name)
+ return self.xmat[bid].reshape((3, 3))
+
+ def get_body_jacp(self, name):
+ """
+ Query the position jacobian of a mujoco body using a name string.
+
+ Args:
+ name (str): The name of a mujoco body
+ Returns:
+ jacp (np.ndarray): The jacp value of the mujoco body
+ """
+ bid = self.model.body_name2id(name)
+ jacp = np.zeros((3, self.model.nv))
+ mujoco.mj_jacBody(self.model._model, self._data, jacp, None, bid)
+ return jacp
+
+ def get_body_jacr(self, name):
+ """
+ Query the rotation jacobian of a mujoco body using a name string.
+
+ Args:
+ name (str): The name of a mujoco body
+ Returns:
+ jacr (np.ndarray): The jacr value of the mujoco body
+ """
+ bid = self.model.body_name2id(name)
+ jacr = np.zeros((3, self.model.nv))
+ mujoco.mj_jacBody(self.model._model, self._data, None, jacr, bid)
+ return jacr
+
+ def get_body_xvelp(self, name):
+ """
+ Query the translational velocity of a mujoco body using a name string.
+
+ Args:
+ name (str): The name of a mujoco body
+ Returns:
+ xvelp (np.ndarray): The translational velocity of the mujoco body.
+ """
+ jacp = self.get_body_jacp(name)
+ xvelp = np.dot(jacp, self.qvel)
+ return xvelp
+
+ def get_body_xvelr(self, name):
+ """
+ Query the rotational velocity of a mujoco body using a name string.
+
+ Args:
+ name (str): The name of a mujoco body
+ Returns:
+ xvelr (np.ndarray): The rotational velocity of the mujoco body.
+ """
+ jacr = self.get_body_jacr(name)
+ xvelr = np.dot(jacr, self.qvel)
+ return xvelr
+
+ def get_geom_xpos(self, name):
+ """
+ Query the cartesian position of a mujoco geom using a name string.
+
+ Args:
+ name (str): The name of a mujoco geom
+ Returns:
+ geom_xpos (np.ndarray): The cartesian position of the mujoco body.
+ """
+ gid = self.model.geom_name2id(name)
+ return self.geom_xpos[gid]
+
+ def get_geom_xmat(self, name):
+ """
+ Query the rotation of a mujoco geom in a rotation matrix using a name string.
+
+ Args:
+ name (str): The name of a mujoco geom
+ Returns:
+ geom_xmat (np.ndarray): The 3x3 rotation matrix of the mujoco geom.
+ """
+ gid = self.model.geom_name2id(name)
+ return self.geom_xmat[gid].reshape((3, 3))
+
+ def get_geom_jacp(self, name):
+ """
+ Query the position jacobian of a mujoco geom using a name string.
+
+ Args:
+ name (str): The name of a mujoco geom
+ Returns:
+ jacp (np.ndarray): The jacp value of the mujoco geom
+ """
+ gid = self.model.geom_name2id(name)
+ jacp = np.zeros((3, self.model.nv))
+ mujoco.mj_jacGeom(self.model._model, self._data, jacp, None, gid)
+ return jacp
+
+ def get_geom_jacr(self, name):
+ """
+ Query the rotation jacobian of a mujoco geom using a name string.
+
+ Args:
+ name (str): The name of a mujoco geom
+ Returns:
+ jacr (np.ndarray): The jacr value of the mujoco geom
+ """
+ gid = self.model.geom_name2id(name)
+        jacr = np.zeros((3, self.model.nv))
+        mujoco.mj_jacGeom(self.model._model, self._data, None, jacr, gid)
+        return jacr
+
+ def get_geom_xvelp(self, name):
+ """
+ Query the translational velocity of a mujoco geom using a name string.
+
+ Args:
+ name (str): The name of a mujoco geom
+ Returns:
+ xvelp (np.ndarray): The translational velocity of the mujoco geom
+ """
+ jacp = self.get_geom_jacp(name)
+ xvelp = np.dot(jacp, self.qvel)
+ return xvelp
+
+ def get_geom_xvelr(self, name):
+ """
+ Query the rotational velocity of a mujoco geom using a name string.
+
+ Args:
+ name (str): The name of a mujoco geom
+ Returns:
+ xvelr (np.ndarray): The rotational velocity of the mujoco geom
+ """
+ jacr = self.get_geom_jacr(name)
+ xvelr = np.dot(jacr, self.qvel)
+ return xvelr
+
+ def get_site_xpos(self, name):
+ """
+ Query the cartesian position of a mujoco site using a name string.
+
+ Args:
+ name (str): The name of a mujoco site
+ Returns:
+            site_xpos (np.ndarray): The cartesian position of the mujoco site
+ """
+ sid = self.model.site_name2id(name)
+ return self.site_xpos[sid]
+
+ def get_site_xmat(self, name):
+ """
+ Query the rotation of a mujoco site in a rotation matrix using a name string.
+
+ Args:
+ name (str): The name of a mujoco site
+ Returns:
+ site_xmat (np.ndarray): The 3x3 rotation matrix of the mujoco site.
+ """
+ sid = self.model.site_name2id(name)
+ return self.site_xmat[sid].reshape((3, 3))
+
+ def get_site_jacp(self, name):
+ """
+ Query the position jacobian of a mujoco site using a name string.
+
+ Args:
+ name (str): The name of a mujoco site
+ Returns:
+ jacp (np.ndarray): The jacp value of the mujoco site
+ """
+ sid = self.model.site_name2id(name)
+ jacp = np.zeros((3, self.model.nv))
+ mujoco.mj_jacSite(self.model._model, self._data, jacp, None, sid)
+ return jacp
+
+ def get_site_jacr(self, name):
+ """
+ Query the rotation jacobian of a mujoco site using a name string.
+
+ Args:
+ name (str): The name of a mujoco site
+ Returns:
+ jacr (np.ndarray): The jacr value of the mujoco site
+ """
+ sid = self.model.site_name2id(name)
+ jacr = np.zeros((3, self.model.nv))
+ mujoco.mj_jacSite(self.model._model, self._data, None, jacr, sid)
+ return jacr
+
+ def get_site_xvelp(self, name):
+ """
+ Query the translational velocity of a mujoco site using a name string.
+
+ Args:
+ name (str): The name of a mujoco site
+ Returns:
+ xvelp (np.ndarray): The translational velocity of the mujoco site
+ """
+ jacp = self.get_site_jacp(name)
+ xvelp = np.dot(jacp, self.qvel)
+ return xvelp
+
+ def get_site_xvelr(self, name):
+ """
+ Query the rotational velocity of a mujoco site using a name string.
+
+ Args:
+ name (str): The name of a mujoco site
+ Returns:
+ xvelr (np.ndarray): The rotational velocity of the mujoco site
+ """
+ jacr = self.get_site_jacr(name)
+ xvelr = np.dot(jacr, self.qvel)
+ return xvelr
+
+ def get_camera_xpos(self, name):
+ """
+ Get the cartesian position of a camera using name
+
+ Args:
+ name (str): The name of a camera
+ Returns:
+ cam_xpos (np.ndarray): The cartesian position of a camera
+ """
+ cid = self.model.camera_name2id(name)
+ return self.cam_xpos[cid]
+
+ def get_camera_xmat(self, name):
+ """
+ Get the rotation of a camera in a rotation matrix using name
+
+ Args:
+ name (str): The name of a camera
+ Returns:
+ cam_xmat (np.ndarray): The 3x3 rotation matrix of a camera
+ """
+ cid = self.model.camera_name2id(name)
+ return self.cam_xmat[cid].reshape((3, 3))
+
+ def get_light_xpos(self, name):
+ """
+ Get cartesian position of a light source
+
+ Args:
+ name (str): The name of a lighting source
+ Returns:
+ light_xpos (np.ndarray): The cartesian position of the light source
+ """
+ lid = self.model.light_name2id(name)
+ return self.light_xpos[lid]
+
+ def get_light_xdir(self, name):
+ """
+ Get the direction of a light source using name
+
+ Args:
+ name (str): The name of a light
+ Returns:
+ light_xdir (np.ndarray): The direction vector of the lightsource
+ """
+ lid = self.model.light_name2id(name)
+ return self.light_xdir[lid]
+
+ def get_sensor(self, name):
+ """
+ Get the data of a sensor using name
+
+ Args:
+ name (str): The name of a sensor
+ Returns:
+ sensordata (np.ndarray): The sensor data vector
+ """
+ sid = self.model.sensor_name2id(name)
+ return self.sensordata[sid]
+
+ def get_mocap_pos(self, name):
+ """
+ Get the position of a mocap body using name.
+
+ Args:
+            name (str): The name of a mocap body
+ Returns:
+ mocap_pos (np.ndarray): The current position of a mocap body.
+ """
+ body_id = self.model.body_name2id(name)
+ mocap_id = self.model.body_mocapid[body_id]
+ return self.mocap_pos[mocap_id]
+
+ def set_mocap_pos(self, name, value):
+ """
+        Set the position of a mocap body using name.
+
+        Args:
+            name (str): The name of a mocap body
+            value (np.ndarray): The desired position of the mocap body.
+ """
+ body_id = self.model.body_name2id(name)
+ mocap_id = self.model.body_mocapid[body_id]
+ self.mocap_pos[mocap_id] = value
+
+ def get_mocap_quat(self, name):
+ """
+ Get the quaternion of a mocap body using name.
+
+ Args:
+            name (str): The name of a mocap body
+ Returns:
+ mocap_quat (np.ndarray): The current quaternion of a mocap body.
+ """
+ body_id = self.model.body_name2id(name)
+ mocap_id = self.model.body_mocapid[body_id]
+ return self.mocap_quat[mocap_id]
+
+ def set_mocap_quat(self, name, value):
+ """
+ Set the quaternion of a mocap body using name.
+
+ Args:
+            name (str): The name of a mocap body
+            value (np.ndarray): The desired quaternion of the mocap body.
+ """
+ body_id = self.model.body_name2id(name)
+ mocap_id = self.model.body_mocapid[body_id]
+ self.mocap_quat[mocap_id] = value
+
+ def get_joint_qpos(self, name):
+ """
+ Get the position of a joint using name.
+
+ Args:
+ name (str): The name of a joint
+
+ Returns:
+ qpos (np.ndarray): The current position of a joint.
+ """
+ addr = self.model.get_joint_qpos_addr(name)
+ if isinstance(addr, (int, np.int32, np.int64)):
+ return self.qpos[addr]
+ else:
+ start_i, end_i = addr
+ return self.qpos[start_i:end_i]
+
+ def set_joint_qpos(self, name, value):
+ """
+        Set the position of a joint using name.
+
+        Args:
+            name (str): The name of a joint
+            value (float or np.ndarray): The desired position(s) of the joint.
+ """
+ addr = self.model.get_joint_qpos_addr(name)
+ if isinstance(addr, (int, np.int32, np.int64)):
+ self.qpos[addr] = value
+ else:
+ start_i, end_i = addr
+ value = np.array(value)
+ assert value.shape == (end_i - start_i,), "Value has incorrect shape %s: %s" % (name, value)
+ self.qpos[start_i:end_i] = value
+
+ def get_joint_qvel(self, name):
+ """
+ Get the velocity of a joint using name.
+
+ Args:
+ name (str): The name of a joint
+
+ Returns:
+ qvel (np.ndarray): The current velocity of a joint.
+ """
+ addr = self.model.get_joint_qvel_addr(name)
+ if isinstance(addr, (int, np.int32, np.int64)):
+ return self.qvel[addr]
+ else:
+ start_i, end_i = addr
+ return self.qvel[start_i:end_i]
+
+ def set_joint_qvel(self, name, value):
+ """
+        Set the velocity of a joint using name.
+
+        Args:
+            name (str): The name of a joint
+            value (float or np.ndarray): The desired velocity of the joint.
+ """
+ addr = self.model.get_joint_qvel_addr(name)
+ if isinstance(addr, (int, np.int32, np.int64)):
+ self.qvel[addr] = value
+ else:
+ start_i, end_i = addr
+ value = np.array(value)
+ assert value.shape == (end_i - start_i,), "Value has incorrect shape %s: %s" % (name, value)
+ self.qvel[start_i:end_i] = value
+
+
+class MjSim:
+ """
+ Meant to somewhat replicate functionality in mujoco-py's MjSim object
+ (see https://github.com/openai/mujoco-py/blob/master/mujoco_py/mjsim.pyx).
+ """
+
+ def __init__(self, model):
+ """
+ Args:
+ model: should be an MjModel instance created via a factory function
+ such as mujoco.MjModel.from_xml_string(xml)
+ """
+ self.model = MjModel(model)
+ self.data = MjData(self.model)
+
+ # offscreen render context object
+ self._render_context_offscreen = None
+
+ @classmethod
+ def from_xml_string(cls, xml):
+ model = mujoco.MjModel.from_xml_string(xml)
+ return cls(model)
+
+ @classmethod
+ def from_xml_file(cls, xml_file):
+ f = open(xml_file, "r")
+ xml = f.read()
+ f.close()
+ return cls.from_xml_string(xml)
+
+ def reset(self):
+ """Reset simulation."""
+ mujoco.mj_resetData(self.model._model, self.data._data)
+
+ def forward(self):
+ """Forward call to synchronize derived quantities."""
+ mujoco.mj_forward(self.model._model, self.data._data)
+
+ def step(self, with_udd=True):
+ """Step simulation."""
+ mujoco.mj_step(self.model._model, self.data._data)
+
+ def render(
+ self,
+ width=None,
+ height=None,
+ *,
+ camera_name=None,
+ depth=False,
+ mode="offscreen",
+ device_id=-1,
+ segmentation=False,
+ ):
+ """
+ Renders view from a camera and returns image as an `numpy.ndarray`.
+ Args:
+ - width (int): desired image width.
+ - height (int): desired image height.
+ - camera_name (str): name of camera in model. If None, the free
+ camera will be used.
+ - depth (bool): if True, also return depth buffer
+ - device (int): device to use for rendering (only for GPU-backed
+ rendering).
+ Returns:
+ - rgb (uint8 array): image buffer from camera
+ - depth (float array): depth buffer from camera (only returned
+ if depth=True)
+ """
+ if camera_name is None:
+ camera_id = None
+ else:
+ camera_id = self.model.camera_name2id(camera_name)
+
+ assert mode == "offscreen", "only offscreen supported for now"
+ assert self._render_context_offscreen is not None
+ with _MjSim_render_lock:
+ self._render_context_offscreen.render(
+ width=width, height=height, camera_id=camera_id, segmentation=segmentation
+ )
+ return self._render_context_offscreen.read_pixels(width, height, depth=depth, segmentation=segmentation)
+
+ def add_render_context(self, render_context):
+ assert render_context.offscreen
+ if self._render_context_offscreen is not None:
+ # free context
+ del self._render_context_offscreen
+ self._render_context_offscreen = render_context
+
+ def get_state(self):
+ """Return MjSimState instance for current state."""
+ return MjSimState(
+ time=self.data.time,
+ qpos=np.copy(self.data.qpos),
+ qvel=np.copy(self.data.qvel),
+ )
+
+ def set_state(self, value):
+ """
+ Set internal state from MjSimState instance. Should
+ call @forward afterwards to synchronize derived quantities.
+ """
+ self.data.time = value.time
+ self.data.qpos[:] = np.copy(value.qpos)
+ self.data.qvel[:] = np.copy(value.qvel)
+
+ def set_state_from_flattened(self, value):
+ """
+ Set internal mujoco state using flat mjstate array. Should
+ call @forward afterwards to synchronize derived quantities.
+
+ See https://github.com/openai/mujoco-py/blob/4830435a169c1f3e3b5f9b58a7c3d9c39bdf4acb/mujoco_py/mjsimstate.pyx#L54
+ """
+ state = MjSimState.from_flattened(value, self)
+
+ # do this instead of @set_state to avoid extra copy of qpos and qvel
+ self.data.time = state.time
+ self.data.qpos[:] = state.qpos
+ self.data.qvel[:] = state.qvel
+
+ def free(self):
+ # clean up here to prevent memory leaks
+ del self._render_context_offscreen
+ del self.data
+ del self.model
+ del self
+ gc.collect()
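+
+# Illustrative usage of MjSim (a minimal sketch; `xml_string` is an assumed MJCF string):
+#   sim = MjSim.from_xml_string(xml_string)
+#   sim.reset()
+#   sim.forward()
+#   sim.step()
+#   state = sim.get_state()  # MjSimState with time, qpos, qvel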
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/buffers.py b/phantom/submodules/phantom-robosuite/robosuite/utils/buffers.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2a1bc20f86a79870d885d76a12739f0263c03fb
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/buffers.py
@@ -0,0 +1,173 @@
+"""
+Collection of Buffer objects with general functionality
+"""
+
+
+import numpy as np
+
+
+class Buffer(object):
+ """
+ Abstract class for different kinds of data buffers. Minimum API should have a "push" and "clear" method
+ """
+
+ def push(self, value):
+ """
+ Pushes a new @value to the buffer
+
+ Args:
+ value: Value to push to the buffer
+ """
+ raise NotImplementedError
+
+ def clear(self):
+ raise NotImplementedError
+
+
+class RingBuffer(Buffer):
+ """
+ Simple RingBuffer object to hold values to average (useful for, e.g.: filtering D component in PID control)
+
+ Note that the buffer object is a 2D numpy array, where each row corresponds to
+ individual entries into the buffer
+
+ Args:
+ dim (int): Size of entries being added. This is, e.g.: the size of a state vector that is to be stored
+ length (int): Size of the ring buffer
+ """
+
+ def __init__(self, dim, length):
+ # Store input args
+ self.dim = dim
+ self.length = length
+
+ # Variable so that initial average values are accurate
+ self._size = 0
+
+ # Save pointer to end of buffer
+ self.ptr = self.length - 1
+
+ # Construct ring buffer
+ self.buf = np.zeros((length, dim))
+
+ def push(self, value):
+ """
+ Pushes a new value into the buffer
+
+ Args:
+ value (int or float or array): Value(s) to push into the array (taken as a single new element)
+ """
+ # Increment pointer, then add value (also increment size if necessary)
+ self.ptr = (self.ptr + 1) % self.length
+ self.buf[self.ptr] = np.array(value)
+ if self._size < self.length:
+ self._size += 1
+
+ def clear(self):
+ """
+ Clears buffer and reset pointer
+ """
+ self.buf = np.zeros((self.length, self.dim))
+ self.ptr = self.length - 1
+ self._size = 0
+
+ @property
+ def current(self):
+ """
+ Gets the most recent value pushed to the buffer
+
+ Returns:
+ float or np.array: Most recent value in buffer
+ """
+ return self.buf[self.ptr]
+
+ @property
+ def average(self):
+ """
+ Gets the average of components in buffer
+
+ Returns:
+ float or np.array: Averaged value of all elements in buffer
+ """
+ return np.mean(self.buf[: self._size], axis=0)
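+
+    # Illustrative usage (a minimal sketch; values chosen arbitrarily):
+    #   rb = RingBuffer(dim=3, length=5)
+    #   rb.push([1.0, 2.0, 3.0]); rb.push([2.0, 4.0, 6.0])
+    #   rb.current  -> array([2.0, 4.0, 6.0])
+    #   rb.average  -> array([1.5, 3.0, 4.5])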
+
+
+class DeltaBuffer(Buffer):
+ """
+ Simple 2-length buffer object to streamline grabbing delta values between "current" and "last" values
+
+ Constructs delta object.
+
+ Args:
+ dim (int): Size of numerical arrays being inputted
+ init_value (None or Iterable): Initial value to fill "last" value with initially.
+ If None (default), last array will be filled with zeros
+ """
+
+ def __init__(self, dim, init_value=None):
+ # Setup delta object
+ self.dim = dim
+ self.last = np.zeros(self.dim) if init_value is None else np.array(init_value)
+ self.current = np.zeros(self.dim)
+
+ def push(self, value):
+ """
+ Pushes a new value into the buffer; current becomes last and @value becomes current
+
+ Args:
+ value (int or float or array): Value(s) to push into the array (taken as a single new element)
+ """
+ self.last = self.current
+ self.current = np.array(value)
+
+ def clear(self):
+ """
+ Clears last and current value
+ """
+ self.last, self.current = np.zeros(self.dim), np.zeros(self.dim)
+
+ @property
+ def delta(self, abs_value=False):
+ """
+ Returns the delta between last value and current value. If abs_value is set to True, then returns
+ the absolute value between the values
+
+ Args:
+ abs_value (bool): Whether to return absolute value or not
+
+ Returns:
+ float or np.array: difference between current and last value
+ """
+ return self.current - self.last if not abs_value else np.abs(self.current - self.last)
+
+ @property
+ def average(self):
+ """
+ Returns the average between the current and last value
+
+ Returns:
+ float or np.array: Averaged value of all elements in buffer
+ """
+ return (self.current + self.last) / 2.0
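+
+    # Illustrative usage (a minimal sketch; values chosen arbitrarily):
+    #   db = DeltaBuffer(dim=2)
+    #   db.push([1.0, 1.0]); db.push([3.0, 0.5])
+    #   db.delta    -> array([2.0, -0.5])
+    #   db.average  -> array([2.0, 0.75])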
+
+
+class DelayBuffer(RingBuffer):
+ """
+ Modified RingBuffer that returns delayed values when polled
+ """
+
+ def get_delayed_value(self, delay):
+ """
+ Returns value @delay increments behind most recent value.
+
+ Args:
+ delay (int): How many steps backwards from most recent value to grab value. Note that this should not be
+ greater than the buffer's length
+
+ Returns:
+ np.array: delayed value
+ """
+ # First make sure that the delay is valid
+ assert delay < self.length, "Requested delay must be less than buffer's length!"
+ # Grab delayed value
+ return self.buf[(self.ptr - delay) % self.length]
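+
+    # Illustrative usage (a minimal sketch; values chosen arbitrarily):
+    #   dbuf = DelayBuffer(dim=1, length=4)
+    #   for v in (1.0, 2.0, 3.0):
+    #       dbuf.push(v)
+    #   dbuf.get_delayed_value(delay=2)  -> array([1.0])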
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/camera_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/camera_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..733e65c57c6597ae7b065674ffe23ffad7d93d74
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/camera_utils.py
@@ -0,0 +1,628 @@
+"""
+This module includes:
+
+- Utility classes for modifying sim cameras
+
+- Utility functions for performing common camera operations such as retrieving
+camera matrices and transforming from world to camera frame or vice-versa.
+"""
+import json
+import xml.etree.ElementTree as ET
+
+import h5py
+import numpy as np
+
+import robosuite
+import robosuite.utils.transform_utils as T
+from robosuite.wrappers import DomainRandomizationWrapper, VisualizationWrapper
+
+
+def get_camera_intrinsic_matrix(sim, camera_name, camera_height, camera_width):
+ """
+ Obtains camera intrinsic matrix.
+
+ Args:
+ sim (MjSim): simulator instance
+ camera_name (str): name of camera
+ camera_height (int): height of camera images in pixels
+ camera_width (int): width of camera images in pixels
+ Return:
+ K (np.array): 3x3 camera matrix
+ """
+ cam_id = sim.model.camera_name2id(camera_name)
+ fovy = sim.model.cam_fovy[cam_id]
+ f = 0.5 * camera_height / np.tan(fovy * np.pi / 360)
+ K = np.array([[f, 0, camera_width / 2], [0, f, camera_height / 2], [0, 0, 1]])
+ return K
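+
+# Worked example for get_camera_intrinsic_matrix (illustrative; fovy and resolution are assumptions):
+#   with fovy = 45 degrees and a 256 x 256 image, f = 0.5 * 256 / tan(22.5 deg) ~= 309.0,
+#   giving K ~= [[309, 0, 128], [0, 309, 128], [0, 0, 1]].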
+
+
+def get_camera_extrinsic_matrix(sim, camera_name):
+ """
+ Returns a 4x4 homogenous matrix corresponding to the camera pose in the
+ world frame. MuJoCo has a weird convention for how it sets up the
+ camera body axis, so we also apply a correction so that the x and y
+ axis are along the camera view and the z axis points along the
+ viewpoint.
+ Normal camera convention: https://docs.opencv.org/2.4/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.html
+
+ Args:
+ sim (MjSim): simulator instance
+ camera_name (str): name of camera
+ Return:
+ R (np.array): 4x4 camera extrinsic matrix
+ """
+ cam_id = sim.model.camera_name2id(camera_name)
+ camera_pos = sim.data.cam_xpos[cam_id]
+ camera_rot = sim.data.cam_xmat[cam_id].reshape(3, 3)
+ R = T.make_pose(camera_pos, camera_rot)
+
+ # IMPORTANT! This is a correction so that the camera axis is set up along the viewpoint correctly.
+ camera_axis_correction = np.array(
+ [[1.0, 0.0, 0.0, 0.0], [0.0, -1.0, 0.0, 0.0], [0.0, 0.0, -1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+ )
+ R = R @ camera_axis_correction
+ return R
+
+
+def get_camera_transform_matrix(sim, camera_name, camera_height, camera_width):
+ """
+ Camera transform matrix to project from world coordinates to pixel coordinates.
+
+ Args:
+ sim (MjSim): simulator instance
+ camera_name (str): name of camera
+ camera_height (int): height of camera images in pixels
+ camera_width (int): width of camera images in pixels
+ Return:
+ K (np.array): 4x4 camera matrix to project from world coordinates to pixel coordinates
+ """
+ R = get_camera_extrinsic_matrix(sim=sim, camera_name=camera_name)
+ K = get_camera_intrinsic_matrix(
+ sim=sim, camera_name=camera_name, camera_height=camera_height, camera_width=camera_width
+ )
+ K_exp = np.eye(4)
+ K_exp[:3, :3] = K
+
+ # Takes a point in world, transforms to camera frame, and then projects onto image plane.
+ return K_exp @ T.pose_inv(R)
+
+
+def get_camera_segmentation(sim, camera_name, camera_height, camera_width):
+ """
+ Obtains camera segmentation matrix.
+
+ Args:
+ sim (MjSim): simulator instance
+ camera_name (str): name of camera
+ camera_height (int): height of camera images in pixels
+ camera_width (int): width of camera images in pixels
+ Return:
+ im (np.array): 2-channel segmented image where the first contains the
+ geom types and the second contains the geom IDs
+ """
+ return sim.render(camera_name=camera_name, height=camera_height, width=camera_width, segmentation=True)[::-1]
+
+
+def get_real_depth_map(sim, depth_map):
+ """
+ By default, MuJoCo will return a depth map that is normalized in [0, 1]. This
+ helper function converts the map so that the entries correspond to actual distances.
+
+ (see https://github.com/deepmind/dm_control/blob/master/dm_control/mujoco/engine.py#L742)
+
+ Args:
+ sim (MjSim): simulator instance
+ depth_map (np.array): depth map with values normalized in [0, 1] (default depth map
+ returned by MuJoCo)
+ Return:
+ depth_map (np.array): depth map that corresponds to actual distances
+ """
+ # Make sure that depth values are normalized
+ assert np.all(depth_map >= 0.0) and np.all(depth_map <= 1.0)
+ extent = sim.model.stat.extent
+ far = sim.model.vis.map.zfar * extent
+ near = sim.model.vis.map.znear * extent
+ return near / (1.0 - depth_map * (1.0 - near / far))
+
+
+def project_points_from_world_to_camera(points, world_to_camera_transform, camera_height, camera_width):
+ """
+ Helper function to project a batch of points in the world frame
+ into camera pixels using the world to camera transformation.
+
+ Args:
+ points (np.array): 3D points in world frame to project onto camera pixel locations. Should
+ be shape [..., 3].
+ world_to_camera_transform (np.array): 4x4 Tensor to go from robot coordinates to pixel
+ coordinates.
+ camera_height (int): height of the camera image
+ camera_width (int): width of the camera image
+
+ Return:
+ pixels (np.array): projected pixel indices of shape [..., 2]
+ """
+ assert points.shape[-1] == 3 # last dimension must be 3D
+ assert len(world_to_camera_transform.shape) == 2
+ assert world_to_camera_transform.shape[0] == 4 and world_to_camera_transform.shape[1] == 4
+
+ # convert points to homogeneous coordinates -> (px, py, pz, 1)
+ ones_pad = np.ones(points.shape[:-1] + (1,))
+ points = np.concatenate((points, ones_pad), axis=-1) # shape [..., 4]
+
+ # batch matrix multiplication of 4 x 4 matrix and 4 x 1 vectors to do robot frame to pixels transform
+ mat_reshape = [1] * len(points.shape[:-1]) + [4, 4]
+ cam_trans = world_to_camera_transform.reshape(mat_reshape) # shape [..., 4, 4]
+ pixels = np.matmul(cam_trans, points[..., None])[..., 0] # shape [..., 4]
+
+ # re-scaling from homogeneous coordinates to recover pixel values
+ # (x, y, z) -> (x / z, y / z)
+ pixels = pixels / pixels[..., 2:3]
+ pixels = pixels[..., :2].round().astype(int) # shape [..., 2]
+
+ # swap first and second coordinates to get pixel indices that correspond to (height, width)
+ # and also clip pixels that are out of range of the camera image
+ pixels = np.concatenate(
+ (
+ pixels[..., 1:2].clip(0, camera_height - 1),
+ pixels[..., 0:1].clip(0, camera_width - 1),
+ ),
+ axis=-1,
+ )
+
+ return pixels
+
+
+def transform_from_pixels_to_world(pixels, depth_map, camera_to_world_transform):
+ """
+ Helper function to take a batch of pixel locations and the corresponding depth image
+ and transform these points from the camera frame to the world frame.
+
+ Args:
+ pixels (np.array): pixel coordinates of shape [..., 2]
+ depth_map (np.array): depth images of shape [..., H, W, 1]
+ camera_to_world_transform (np.array): 4x4 Tensor to go from pixel coordinates to world
+ coordinates.
+
+ Return:
+ points (np.array): 3D points in robot frame of shape [..., 3]
+ """
+
+ # make sure leading dimensions are consistent
+ pixels_leading_shape = pixels.shape[:-1]
+ depth_map_leading_shape = depth_map.shape[:-3]
+ assert depth_map_leading_shape == pixels_leading_shape
+
+ # sample from the depth map using the pixel locations with bilinear sampling
+ pixels = pixels.astype(float)
+ im_h, im_w = depth_map.shape[-2:]
+ depth_map_reshaped = depth_map.reshape(-1, im_h, im_w, 1)
+ z = bilinear_interpolate(im=depth_map_reshaped, x=pixels[..., 1:2], y=pixels[..., 0:1])
+ z = z.reshape(*depth_map_leading_shape, 1) # shape [..., 1]
+
+ # form 4D homogeneous camera vector to transform - [x * z, y * z, z, 1]
+ # (note that we need to swap the first 2 dimensions of pixels to go from pixel indices
+ # to camera coordinates)
+ cam_pts = [pixels[..., 1:2] * z, pixels[..., 0:1] * z, z, np.ones_like(z)]
+ cam_pts = np.concatenate(cam_pts, axis=-1) # shape [..., 4]
+
+ # batch matrix multiplication of 4 x 4 matrix and 4 x 1 vectors to do camera to robot frame transform
+ mat_reshape = [1] * len(cam_pts.shape[:-1]) + [4, 4]
+ cam_trans = camera_to_world_transform.reshape(mat_reshape) # shape [..., 4, 4]
+ points = np.matmul(cam_trans, cam_pts[..., None])[..., 0] # shape [..., 4]
+ return points[..., :3]
+
+
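+# Illustrative usage sketch (not part of the upstream robosuite API).
+def _example_world_pixel_roundtrip(sim, depth_map, camera_name="agentview", camera_height=256, camera_width=256):
+    """
+    Sketch showing how the helpers above compose. Assumes @sim is an MjSim with a camera named
+    @camera_name and that @depth_map is a metric depth image of shape (camera_height, camera_width),
+    e.g. the output of get_real_depth_map applied to a rendered depth buffer.
+    """
+    # world -> pixel transform and a sample world-frame point to project
+    world_to_pix = get_camera_transform_matrix(sim, camera_name, camera_height, camera_width)
+    world_points = np.array([[0.0, 0.0, 0.8]])
+
+    # project onto the image plane, yielding clipped (row, col) pixel indices
+    pixels = project_points_from_world_to_camera(world_points, world_to_pix, camera_height, camera_width)
+
+    # lift the pixel back to a 3D world point using the depth map and the inverse transform
+    points = transform_from_pixels_to_world(pixels, depth_map[None, ..., None], np.linalg.inv(world_to_pix))
+    return pixels, points
+
+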
+def bilinear_interpolate(im, x, y):
+ """
+ Bilinear sampling for pixel coordinates x and y from source image im.
+ Taken from https://stackoverflow.com/questions/12729228/simple-efficient-bilinear-interpolation-of-images-in-numpy-and-python
+ """
+ x = np.asarray(x)
+ y = np.asarray(y)
+
+ x0 = np.floor(x).astype(int)
+ x1 = x0 + 1
+ y0 = np.floor(y).astype(int)
+ y1 = y0 + 1
+
+ x0 = np.clip(x0, 0, im.shape[1] - 1)
+ x1 = np.clip(x1, 0, im.shape[1] - 1)
+ y0 = np.clip(y0, 0, im.shape[0] - 1)
+ y1 = np.clip(y1, 0, im.shape[0] - 1)
+
+ Ia = im[y0, x0]
+ Ib = im[y1, x0]
+ Ic = im[y0, x1]
+ Id = im[y1, x1]
+
+ wa = (x1 - x) * (y1 - y)
+ wb = (x1 - x) * (y - y0)
+ wc = (x - x0) * (y1 - y)
+ wd = (x - x0) * (y - y0)
+
+ return wa * Ia + wb * Ib + wc * Ic + wd * Id
+
+
+class CameraMover:
+ """
+ A class for manipulating a camera.
+
+ WARNING: This class will initially RE-INITIALIZE the environment.
+
+ Args:
+ env (MujocoEnv): Mujoco environment to modify camera
+ camera (str): Which camera to mobilize during playback, e.g.: frontview, agentview, etc.
+ init_camera_pos (None or 3-array): If specified, should be the (x,y,z) global cartesian pos to
+ initialize camera to
+ init_camera_quat (None or 4-array): If specified, should be the (x,y,z,w) global quaternion orientation to
+ initialize camera to
+ """
+
+ def __init__(
+ self,
+ env,
+ camera="frontview",
+ init_camera_pos=None,
+ init_camera_quat=None,
+ ):
+ # Store relevant values and initialize other values
+ self.env = env
+ self.camera = camera
+ self.mover_body_name = f"{self.camera}_cameramover"
+
+ # Get state
+ state = self.env.sim.get_state().flatten()
+
+ # Grab environment xml
+ xml = env.sim.model.get_xml()
+
+ # Modify xml to add mocap to move camera around
+ xml = self.modify_xml_for_camera_movement(xml=xml, camera_name=self.camera)
+
+ # Reset the environment and restore the state
+ self.env.reset_from_xml_string(xml)
+ self.env.sim.reset()
+ self.env.sim.set_state_from_flattened(state)
+ self.env.sim.forward()
+
+ # Set initial camera pose
+ self.set_camera_pose(pos=init_camera_pos, quat=init_camera_quat)
+
+ def set_camera_pose(self, pos=None, quat=None):
+ """
+ Sets the camera pose, which optionally includes position and / or quaternion
+
+ Args:
+ pos (None or 3-array): If specified, should be the (x,y,z) global cartesian pos to set camera to
+ quat (None or 4-array): If specified, should be the (x,y,z,w) global quaternion orientation to set camera to
+ """
+ if pos is not None:
+ self.env.sim.data.set_mocap_pos(self.mover_body_name, pos)
+ if quat is not None:
+ self.env.sim.data.set_mocap_quat(self.mover_body_name, T.convert_quat(quat, to="wxyz"))
+
+ # Make sure changes propagate in sim
+ self.env.sim.forward()
+
+ def get_camera_pose(self):
+ """
+ Grab the current camera pose, which optionally includes position and / or quaternion
+
+ Returns:
+ 2-tuple:
+ - 3-array: (x,y,z) camera global cartesian pos
+ - 4-array: (x,y,z,w) camera global quaternion orientation
+ """
+ # Grab values from sim
+ pos = self.env.sim.data.get_mocap_pos(self.mover_body_name)
+ quat = T.convert_quat(self.env.sim.data.get_mocap_quat(self.mover_body_name), to="xyzw")
+
+ return pos, quat
+
+ def modify_xml_for_camera_movement(self, xml, camera_name):
+ """
+ Cameras in mujoco are 'fixed', so they can't be moved by default.
+ Although it's possible to hack position movement, rotation movement
+ does not work. An alternative is to attach a camera to a mocap body,
+ and move the mocap body.
+
+ This function modifies the camera with name @camera_name in the xml
+ by attaching it to a mocap body that can move around freely. In this
+ way, we can move the camera by moving the mocap body.
+
+ See http://www.mujoco.org/forum/index.php?threads/move-camera.2201/ for
+ further details.
+
+ Args:
+ xml (str): Mujoco sim XML file as a string
+ camera_name (str): Name of camera to tune
+ """
+ tree = ET.fromstring(xml)
+ wb = tree.find("worldbody")
+
+ # find the correct camera
+ camera_elem = None
+ cameras = wb.findall("camera")
+ for camera in cameras:
+ if camera.get("name") == camera_name:
+ camera_elem = camera
+ break
+ assert camera_elem is not None
+
+ # add mocap body
+ mocap = ET.SubElement(wb, "body")
+ mocap.set("name", self.mover_body_name)
+ mocap.set("mocap", "true")
+ mocap.set("pos", camera.get("pos"))
+ mocap.set("quat", camera.get("quat"))
+ new_camera = ET.SubElement(mocap, "camera")
+ new_camera.set("mode", "fixed")
+ new_camera.set("name", camera.get("name"))
+ new_camera.set("pos", "0 0 0")
+
+ # remove old camera element
+ wb.remove(camera_elem)
+
+ return ET.tostring(tree, encoding="utf8").decode("utf8")
+
+ def rotate_camera(self, point, axis, angle):
+ """
+ Rotate the camera view about a direction (in the camera frame).
+
+ Args:
+ point (None or 3-array): (x,y,z) cartesian coordinates about which to rotate camera in camera frame. If None,
+ assumes the point is the current location of the camera
+ axis (3-array): (ax,ay,az) axis about which to rotate camera in camera frame
+ angle (float): how much to rotate about that direction
+
+ Returns:
+ 2-tuple:
+ pos: (x,y,z) updated camera position
+ quat: (x,y,z,w) updated camera quaternion orientation
+ """
+ # current camera rotation + pos
+ camera_pos = np.array(self.env.sim.data.get_mocap_pos(self.mover_body_name))
+ camera_rot = T.quat2mat(T.convert_quat(self.env.sim.data.get_mocap_quat(self.mover_body_name), to="xyzw"))
+
+ # rotate by angle and direction to get new camera rotation
+ rad = np.pi * angle / 180.0
+ R = T.rotation_matrix(rad, axis, point=point)
+ camera_pose = np.zeros((4, 4))
+ camera_pose[:3, :3] = camera_rot
+ camera_pose[:3, 3] = camera_pos
+ camera_pose = camera_pose @ R
+
+ # Update camera pose
+ pos, quat = camera_pose[:3, 3], T.mat2quat(camera_pose[:3, :3])
+ self.set_camera_pose(pos=pos, quat=quat)
+
+ return pos, quat
+
+ def move_camera(self, direction, scale):
+ """
+ Move the camera view along a direction (in the camera frame).
+
+ Args:
+ direction (3-array): direction vector for where to move camera in camera frame
+ scale (float): how much to move along that direction
+ """
+ # current camera rotation + pos
+ camera_pos = np.array(self.env.sim.data.get_mocap_pos(self.mover_body_name))
+ camera_quat = self.env.sim.data.get_mocap_quat(self.mover_body_name)
+ camera_rot = T.quat2mat(T.convert_quat(camera_quat, to="xyzw"))
+
+ # move along camera frame axis and set new position
+ camera_pos += scale * camera_rot.dot(direction)
+ self.set_camera_pose(pos=camera_pos)
+
+ return camera_pos, camera_quat
+
+
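+# Illustrative usage sketch (not part of the upstream robosuite API).
+def _example_orbit_camera(env, n_frames=10, height=256, width=256):
+    """
+    Sketch of panning a camera around the workspace with CameraMover, assuming @env is an
+    already-constructed robosuite environment with an offscreen renderer and a "frontview" camera.
+    """
+    camera_mover = CameraMover(env=env, camera="frontview")
+    frames = []
+    for _ in range(n_frames):
+        # rotate 1 degree per frame about an axis specified in the camera frame (see rotate_camera above)
+        camera_mover.rotate_camera(point=(0, 0, 0.8), axis=(0, 0, 1), angle=1.0)
+        frames.append(env.sim.render(camera_name="frontview", height=height, width=width)[::-1])
+    return frames
+
+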
+class DemoPlaybackCameraMover(CameraMover):
+ """
+ A class for playing back demonstrations and recording the resulting frames with the flexibility of a mobile camera
+ that can be set manually or panned automatically frame-by-frame
+
+ Note: domain randomization is also supported for playback!
+
+ Args:
+ demo (str): absolute fpath to .hdf5 demo
+ env_config (None or dict): (optional) values to override inferred environment information from demonstration.
+ (e.g.: camera h / w, depths, segmentations, etc...)
+ Any value not specified will be inferred from the extracted demonstration metadata
+ Note that there are some specific arguments that MUST be set a certain way; if any of these values
+ are specified via @env_config, an error will be raised
+ replay_from_actions (bool): If True, will replay demonstration's actions. Otherwise, replays will be hardcoded
+ from the demonstration states
+ visualize_sites (bool): If True, will visualize sites during playback. Note that this CANNOT be paired
+ simultaneously with camera segmentations
+ camera (str): Which camera to mobilize during playback, e.g.: frontview, agentview, etc.
+ init_camera_pos (None or 3-array): If specified, should be the (x,y,z) global cartesian pos to
+ initialize camera to
+ init_camera_quat (None or 4-array): If specified, should be the (x,y,z,w) global quaternion orientation to
+ initialize camera to
+ use_dr (bool): If True, will use domain randomization during playback
+ dr_args (None or dict): If specified, will set the domain randomization wrapper arguments if using dr
+ """
+
+ def __init__(
+ self,
+ demo,
+ env_config=None,
+ replay_from_actions=False,
+ visualize_sites=False,
+ camera="frontview",
+ init_camera_pos=None,
+ init_camera_quat=None,
+ use_dr=False,
+ dr_args=None,
+ ):
+ # Store relevant values and initialize other values
+ self.camera_id = None
+ self.replay_from_actions = replay_from_actions
+ self.states = None
+ self.actions = None
+ self.step = None
+ self.n_steps = None
+ self.current_ep = None
+ self.started = False
+
+ # Load the demo
+ self.f = h5py.File(demo, "r")
+
+ # Extract relevant info
+ env_info = json.loads(self.f["data"].attrs["env_info"])
+
+ # Construct default env arguments
+ default_args = {
+ "has_renderer": False,
+ "has_offscreen_renderer": True,
+ "ignore_done": True,
+ "use_camera_obs": True,
+ "reward_shaping": True,
+ "hard_reset": False,
+ "camera_names": camera,
+ }
+
+ # If custom env_config is specified, make sure that there's no overlap with default args and merge with config
+ if env_config is not None:
+ for k in env_config.keys():
+ assert k not in default_args, f"Key {k} cannot be specified in env_config!"
+ env_info.update(env_config)
+
+ # Merge in default args
+ env_info.update(default_args)
+
+ # Create env
+ env = robosuite.make(**env_info)
+
+ # Optionally wrap with visualization wrapper
+ if visualize_sites:
+ env = VisualizationWrapper(env=env)  # wrap the env just constructed; self.env is not set until super().__init__()
+
+ # Optionally use domain randomization if specified
+ self.use_dr = use_dr
+ if self.use_dr:
+ default_dr_args = {
+ "seed": 1,
+ "randomize_camera": False,
+ "randomize_every_n_steps": 10,
+ }
+ default_dr_args.update(dr_args or {})
+ env = DomainRandomizationWrapper(
+ env=env,
+ **default_dr_args,
+ )
+
+ # list of all demonstrations episodes
+ self.demos = list(self.f["data"].keys())
+
+ # Run super init
+ super().__init__(
+ env=env,
+ camera=camera,
+ init_camera_pos=init_camera_pos,
+ init_camera_quat=init_camera_quat,
+ )
+
+ # Load episode 0 by default
+ self.load_episode_xml(demo_num=0)
+
+ def load_episode_xml(self, demo_num):
+ """
+ Loads demo episode with specified @demo_num into the simulator.
+
+ Args:
+ demo_num (int): Demonstration number to load
+ """
+ # Grab raw xml file
+ ep = self.demos[demo_num]
+ model_xml = self.f[f"data/{ep}"].attrs["model_file"]
+
+ # Reset environment
+ self.env.reset()
+ xml = self.env.edit_model_xml(model_xml)
+ xml = self.modify_xml_for_camera_movement(xml, camera_name=self.camera)
+ self.env.reset_from_xml_string(xml)
+ self.env.sim.reset()
+
+ # Update camera info
+ self.camera_id = self.env.sim.model.camera_name2id(self.camera)
+
+ # Load states and actions
+ self.states = self.f[f"data/{ep}/states"][()]
+ self.actions = np.array(self.f[f"data/{ep}/actions"][()])
+
+ # Set initial state
+ self.env.sim.set_state_from_flattened(self.states[0])
+
+ # Reset step count and set current episode number
+ self.step = 0
+ self.n_steps = len(self.actions)
+ self.current_ep = demo_num
+
+ # Notify user of loaded episode
+ print(f"Loaded episode {demo_num}.")
+
+ def grab_next_frame(self):
+ """
+ Grabs the next frame in the demo sequence by stepping the simulation and returning the resulting value(s)
+
+ Returns:
+ dict: Keyword-mapped np.arrays from the demonstration sequence, corresponding to all image modalities used
+ in the playback environment (e.g.: "image", "depth", "segmentation_instance")
+ """
+ # If the episode is already completed, load the next episode
+ if self.step == self.n_steps:
+ self.load_episode_xml(demo_num=self.current_ep + 1)
+
+ # Step the environment and grab obs
+ if self.replay_from_actions:
+ obs, _, _, _ = self.env.step(self.actions[self.step])
+ else: # replay from states
+ self.env.sim.set_state_from_flattened(self.states[self.step + 1])
+ if self.use_dr:
+ self.env.step_randomization()
+ self.env.sim.forward()
+ obs = self.env._get_observation()
+
+ # Increment the step counter
+ self.step += 1
+
+ # Return all relevant frames
+ return {k.split(f"{self.camera}_")[-1]: obs[k] for k in obs if self.camera in k}
+
+ def grab_episode_frames(self, demo_num, pan_point=(0, 0, 0.8), pan_axis=(0, 0, 1), pan_rate=0.01):
+ """
+ Playback entire episode @demo_num, while optionally rotating the camera about point @pan_point and
+ axis @pan_axis if @pan_rate > 0
+
+ Args:
+ demo_num (int): Demonstration episode number to load for playback
+ pan_point (3-array): (x,y,z) cartesian coordinates about which to rotate camera in camera frame
+ pan_axis (3-array): (ax,ay,az) axis about which to rotate camera in camera frame
+ pan_rate (float): how quickly to pan camera if not 0
+
+ Returns:
+ dict: Keyword-mapped stacked np.arrays from the demonstration sequence, corresponding to all image
+ modalities used in the playback environment (e.g.: "image", "depth", "segmentation_instance")
+
+ """
+ # First, load env
+ self.load_episode_xml(demo_num=demo_num)
+
+ # Initialize dict to return
+ obs = self.env._get_observation()
+ frames_dict = {k.split(f"{self.camera}_")[-1]: [] for k in obs if self.camera in k}
+
+ # Continue to loop playback steps while there are still frames left in the episode
+ while self.step < self.n_steps:
+ # Take playback step and add frames
+ for k, frame in self.grab_next_frame().items():
+ frames_dict[k].append(frame)
+
+ # Update camera pose
+ self.rotate_camera(point=pan_point, axis=pan_axis, angle=pan_rate)
+
+ # Stack all frames and return
+ return {k: np.stack(frames) for k, frames in frames_dict.items()}
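+
+
+# Illustrative usage sketch (not part of the upstream robosuite API).
+def _example_playback_demo(demo_path):
+    """
+    Sketch of rendering one episode of a robosuite .hdf5 demonstration with a slowly panning
+    camera; @demo_path is a placeholder path to an existing demo file.
+    """
+    playback = DemoPlaybackCameraMover(demo=demo_path, camera="frontview")
+    frames = playback.grab_episode_frames(demo_num=0, pan_rate=0.05)
+    # "image" assumes RGB camera observations are enabled in the playback environment
+    return frames["image"]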
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/control_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/control_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..18cc0a6b30cd8011f5267cc39542c86a01867e44
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/control_utils.py
@@ -0,0 +1,236 @@
+import numpy as np
+
+import robosuite.utils.transform_utils as trans
+from robosuite.utils.numba import jit_decorator
+
+
+@jit_decorator
+def nullspace_torques(mass_matrix, nullspace_matrix, initial_joint, joint_pos, joint_vel, joint_kp=10):
+ """
+ For a robot with redundant DOF(s), a nullspace exists which is orthogonal to the remainder of the controllable
+ subspace of the robot's joints. Therefore, an additional secondary objective that does not impact the original
+ controller objective can be maintained using these nullspace torques.
+
+ This utility function specifically calculates nullspace torques that attempt to maintain the given robot joint
+ configuration @initial_joint with zero velocity, using proportional gain @joint_kp
+
+ :Note: @mass_matrix, @nullspace_matrix, @joint_pos, and @joint_vel should reflect the robot's state at the current
+ timestep
+
+ Args:
+ mass_matrix (np.array): 2d array representing the mass matrix of the robot
+ nullspace_matrix (np.array): 2d array representing the nullspace matrix of the robot
+ initial_joint (np.array): Joint configuration to be used for calculating nullspace torques
+ joint_pos (np.array): Current joint positions
+ joint_vel (np.array): Current joint velocities
+ joint_kp (float): Proportional control gain when calculating nullspace torques
+
+ Returns:
+ np.array: nullspace torques
+ """
+
+ # kv calculated below corresponds to critical damping
+ joint_kv = np.sqrt(joint_kp) * 2
+
+ # calculate desired torques based on gains and error
+ pose_torques = np.dot(mass_matrix, (joint_kp * (initial_joint - joint_pos) - joint_kv * joint_vel))
+
+ # map desired torques to null subspace within joint torque actuator space
+ nullspace_torques = np.dot(nullspace_matrix.transpose(), pose_torques)
+ return nullspace_torques
+
+
+@jit_decorator
+def opspace_matrices(mass_matrix, J_full, J_pos, J_ori):
+ """
+ Calculates the relevant matrices used in the operational space control algorithm
+
+ Args:
+ mass_matrix (np.array): 2d array representing the mass matrix of the robot
+ J_full (np.array): 2d array representing the full Jacobian matrix of the robot
+ J_pos (np.array): 2d array representing the position components of the Jacobian matrix of the robot
+ J_ori (np.array): 2d array representing the orientation components of the Jacobian matrix of the robot
+
+ Returns:
+ 4-tuple:
+
+ - (np.array): full lambda matrix (as 2d array)
+ - (np.array): position components of lambda matrix (as 2d array)
+ - (np.array): orientation components of lambda matrix (as 2d array)
+ - (np.array): nullspace matrix (as 2d array)
+ """
+ mass_matrix_inv = np.linalg.inv(mass_matrix)
+
+ # J M^-1 J^T
+ lambda_full_inv = np.dot(np.dot(J_full, mass_matrix_inv), J_full.transpose())
+
+ # Jx M^-1 Jx^T
+ lambda_pos_inv = np.dot(np.dot(J_pos, mass_matrix_inv), J_pos.transpose())
+
+ # Jr M^-1 Jr^T
+ lambda_ori_inv = np.dot(np.dot(J_ori, mass_matrix_inv), J_ori.transpose())
+
+ # take the inverses, but zero out small singular values for stability
+ lambda_full = np.linalg.pinv(lambda_full_inv)
+ lambda_pos = np.linalg.pinv(lambda_pos_inv)
+ lambda_ori = np.linalg.pinv(lambda_ori_inv)
+
+ # nullspace
+ Jbar = np.dot(mass_matrix_inv, J_full.transpose()).dot(lambda_full)
+ nullspace_matrix = np.eye(J_full.shape[-1], J_full.shape[-1]) - np.dot(Jbar, J_full)
+
+ return lambda_full, lambda_pos, lambda_ori, nullspace_matrix
+
+
+@jit_decorator
+def orientation_error(desired, current):
+ """
+ This function calculates a 3-dimensional orientation error vector for use in the
+ impedance controller. It does this by computing the delta rotation between the
+ inputs and converting that rotation to exponential coordinates (axis-angle
+ representation, where the 3d vector is axis * angle).
+ See https://en.wikipedia.org/wiki/Axis%E2%80%93angle_representation for more information.
+ Optimized function to determine orientation error from matrices
+
+ Args:
+ desired (np.array): 2d array representing target orientation matrix
+ current (np.array): 2d array representing current orientation matrix
+
+ Returns:
+ np.array: 3-array representing the orientation error as an axis-angle style vector
+ """
+ rc1 = current[0:3, 0]
+ rc2 = current[0:3, 1]
+ rc3 = current[0:3, 2]
+ rd1 = desired[0:3, 0]
+ rd2 = desired[0:3, 1]
+ rd3 = desired[0:3, 2]
+
+ error = 0.5 * (np.cross(rc1, rd1) + np.cross(rc2, rd2) + np.cross(rc3, rd3))
+
+ return error
+
+
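+# Illustrative sketch (not part of the upstream robosuite API).
+def _example_orientation_error():
+    """
+    Small sketch: a 0.1 rad yaw offset between two rotation matrices yields an error vector of
+    approximately (0, 0, 0.1), i.e. the rotation axis scaled by (the sine of) the small angle.
+    """
+    current = np.eye(3)
+    desired = trans.euler2mat(np.array([0.0, 0.0, 0.1]))
+    return orientation_error(desired, current)
+
+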
+def set_goal_position(delta, current_position, position_limit=None, set_pos=None):
+ """
+ Calculates and returns the desired goal position, clipping the result according to @position_limit.
+ @delta and @current_position must be specified if a relative goal is requested, else @set_pos must be
+ specified to define a global goal position
+
+ Args:
+ delta (np.array): Desired relative change in position
+ current_position (np.array): Current position
+ position_limit (None or np.array): 2d array defining the (min, max) limits of permissible position goal commands
+ set_pos (None or np.array): If set, will ignore @delta and set the goal position to this value
+
+ Returns:
+ np.array: calculated goal position in absolute coordinates
+
+ Raises:
+ ValueError: [Invalid position_limit shape]
+ """
+ n = len(current_position)
+ if set_pos is not None:
+ goal_position = set_pos
+ else:
+ goal_position = current_position + delta
+
+ if position_limit is not None:
+ if position_limit.shape != (2, n):
+ raise ValueError(
+ "Position limit should be shaped (2,{}) " "but is instead: {}".format(n, position_limit.shape)
+ )
+
+ # Clip goal position
+ goal_position = np.clip(goal_position, position_limit[0], position_limit[1])
+
+ return goal_position
+
+
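+# Illustrative sketch (not part of the upstream robosuite API).
+def _example_clipped_goal_position():
+    """
+    Small sketch: a relative delta that would push the goal outside the permissible workspace
+    is clipped back to the (min, max) limits, yielding array([0.3, 0.0, 1.0]) here.
+    """
+    current = np.array([0.0, 0.0, 1.0])
+    delta = np.array([0.5, 0.0, 0.0])
+    limits = np.array([[-0.3, -0.3, 0.8], [0.3, 0.3, 1.2]])  # rows are (min, max)
+    return set_goal_position(delta, current, position_limit=limits)
+
+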
+def set_goal_orientation(delta, current_orientation, orientation_limit=None, set_ori=None):
+ """
+ Calculates and returns the desired goal orientation, clipping the result according to @orientation_limit.
+ @delta and @current_orientation must be specified if a relative goal is requested, else @set_ori must be
+ an orientation matrix specified to define a global orientation
+
+ Args:
+ delta (np.array): Desired relative change in orientation, in axis-angle form [ax, ay, az]
+ current_orientation (np.array): Current orientation, in rotation matrix form
+ orientation_limit (None or np.array): 2d array defining the (min, max) limits of permissible orientation goal commands
+ set_ori (None or np.array): If set, will ignore @delta and set the goal orientation to this value
+
+ Returns:
+ np.array: calculated goal orientation in absolute coordinates
+
+ Raises:
+ ValueError: [Invalid orientation_limit shape]
+ """
+ # directly set orientation
+ if set_ori is not None:
+ goal_orientation = set_ori
+
+ # otherwise use delta to set goal orientation
+ else:
+ # convert axis-angle value to rotation matrix
+ quat_error = trans.axisangle2quat(delta)
+ rotation_mat_error = trans.quat2mat(quat_error)
+ goal_orientation = np.dot(rotation_mat_error, current_orientation)
+
+ # check for orientation limits
+ if np.array(orientation_limit).any():
+ if orientation_limit.shape != (2, 3):
+ raise ValueError(
+ "Orientation limit should be shaped (2,3) " "but is instead: {}".format(orientation_limit.shape)
+ )
+
+ # Convert to euler angles for clipping
+ euler = trans.mat2euler(goal_orientation)
+
+ # Clip euler angles according to specified limits
+ limited = False
+ for idx in range(3):
+ if orientation_limit[0][idx] < orientation_limit[1][idx]: # Normal angle sector meaning
+ if orientation_limit[0][idx] < euler[idx] < orientation_limit[1][idx]:
+ continue
+ else:
+ limited = True
+ dist_to_lower = euler[idx] - orientation_limit[0][idx]
+ if dist_to_lower > np.pi:
+ dist_to_lower -= 2 * np.pi
+ elif dist_to_lower < -np.pi:
+ dist_to_lower += 2 * np.pi
+
+ dist_to_higher = euler[idx] - orientation_limit[1][idx]
+ if dist_to_higher > np.pi:
+ dist_to_higher -= 2 * np.pi
+ elif dist_to_higher < -np.pi:
+ dist_to_higher += 2 * np.pi
+
+ if dist_to_lower < dist_to_higher:
+ euler[idx] = orientation_limit[0][idx]
+ else:
+ euler[idx] = orientation_limit[1][idx]
+ else: # Inverted angle sector meaning
+ if orientation_limit[0][idx] < euler[idx] or euler[idx] < orientation_limit[1][idx]:
+ continue
+ else:
+ limited = True
+ dist_to_lower = euler[idx] - orientation_limit[0][idx]
+ if dist_to_lower > np.pi:
+ dist_to_lower -= 2 * np.pi
+ elif dist_to_lower < -np.pi:
+ dist_to_lower += 2 * np.pi
+
+ dist_to_higher = euler[idx] - orientation_limit[1][idx]
+ if dist_to_higher > np.pi:
+ dist_to_higher -= 2 * np.pi
+ elif dist_to_higher < -np.pi:
+ dist_to_higher += 2 * np.pi
+
+ if dist_to_lower < dist_to_higher:
+ euler[idx] = orientation_limit[0][idx]
+ else:
+ euler[idx] = orientation_limit[1][idx]
+ if limited:
+ goal_orientation = trans.euler2mat(np.array([euler[0], euler[1], euler[2]]))
+ return goal_orientation
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/errors.py b/phantom/submodules/phantom-robosuite/robosuite/utils/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..efe0b5cd7119b9a987975ae6028a9e76326d1f8c
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/errors.py
@@ -0,0 +1,22 @@
+class robosuiteError(Exception):
+ """Base class for exceptions in robosuite."""
+
+ pass
+
+
+class XMLError(robosuiteError):
+ """Exception raised for errors related to xml."""
+
+ pass
+
+
+class SimulationError(robosuiteError):
+ """Exception raised for errors during runtime."""
+
+ pass
+
+
+class RandomizationError(robosuiteError):
+ """Exception raised for really really bad RNG."""
+
+ pass
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/input_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/input_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..26dee46cce4db7a9d7060064a96a07c8a23fa7ee
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/input_utils.py
@@ -0,0 +1,255 @@
+"""
+Utility functions for grabbing user inputs
+"""
+
+import numpy as np
+
+import robosuite as suite
+import robosuite.utils.transform_utils as T
+from robosuite.devices import *
+from robosuite.models.robots import *
+from robosuite.robots import *
+
+
+def choose_environment():
+ """
+ Prints out environment options, and returns the selected env_name choice
+
+ Returns:
+ str: Chosen environment name
+ """
+ # get the list of all environments
+ envs = sorted(suite.ALL_ENVIRONMENTS)
+
+ # Select environment to run
+ print("Here is a list of environments in the suite:\n")
+
+ for k, env in enumerate(envs):
+ print("[{}] {}".format(k, env))
+ print()
+ try:
+ s = input("Choose an environment to run " + "(enter a number from 0 to {}): ".format(len(envs) - 1))
+ # parse input into a number within range
+ k = min(max(int(s), 0), len(envs) - 1)
+ except:
+ k = 0
+ print("Input is not valid. Use {} by default.\n".format(envs[k]))
+
+ # Return the chosen environment name
+ return envs[k]
+
+
+def choose_controller():
+ """
+ Prints out controller options, and returns the requested controller name
+
+ Returns:
+ str: Chosen controller name
+ """
+ # get the list of all controllers
+ controllers_info = suite.controllers.CONTROLLER_INFO
+ controllers = list(suite.ALL_CONTROLLERS)
+
+ # Select controller to use
+ print("Here is a list of controllers in the suite:\n")
+
+ for k, controller in enumerate(controllers):
+ print("[{}] {} - {}".format(k, controller, controllers_info[controller]))
+ print()
+ try:
+ s = input("Choose a controller for the robot " + "(enter a number from 0 to {}): ".format(len(controllers) - 1))
+ # parse input into a number within range
+ k = min(max(int(s), 0), len(controllers) - 1)
+ except:
+ k = 0
+ print("Input is not valid. Use {} by default.".format(controllers)[k])
+
+ # Return chosen controller
+ return controllers[k]
+
+
+def choose_multi_arm_config():
+ """
+ Prints out multi-arm environment configuration options, and returns the requested config name
+
+ Returns:
+ str: Requested multi-arm configuration name
+ """
+ # Get the list of all multi arm configs
+ env_configs = {
+ "Single Arms Opposed": "single-arm-opposed",
+ "Single Arms Parallel": "single-arm-parallel",
+ "Bimanual": "bimanual",
+ }
+
+ # Select environment configuration
+ print("A multi-arm environment was chosen. Here is a list of multi-arm environment configurations:\n")
+
+ for k, env_config in enumerate(list(env_configs)):
+ print("[{}] {}".format(k, env_config))
+ print()
+ try:
+ s = input(
+ "Choose a configuration for this environment "
+ + "(enter a number from 0 to {}): ".format(len(env_configs) - 1)
+ )
+ # parse input into a number within range
+ k = min(max(int(s), 0), len(env_configs) - 1)
+ except:
+ k = 0
+ print("Input is not valid. Use {} by default.".format(list(env_configs)[k]))
+
+ # Return requested configuration
+ return list(env_configs.values())[k]
+
+
+def choose_robots(exclude_bimanual=False):
+ """
+ Prints out robot options, and returns the requested robot. Restricts options to single-armed robots if
+ @exclude_bimanual is set to True (False by default)
+
+ Args:
+ exclude_bimanual (bool): If set, excludes bimanual robots from the robot options
+
+ Returns:
+ str: Requested robot name
+ """
+ # Get the list of robots
+ robots = {
+ "Sawyer",
+ "Panda",
+ "Jaco",
+ "Kinova3",
+ "IIWA",
+ "UR5e",
+ }
+
+ # Add Baxter if bimanual robots are not excluded
+ if not exclude_bimanual:
+ robots.add("Baxter")
+
+ # Make sure set is deterministically sorted
+ robots = sorted(robots)
+
+ # Select robot
+ print("Here is a list of available robots:\n")
+
+ for k, robot in enumerate(robots):
+ print("[{}] {}".format(k, robot))
+ print()
+ try:
+ s = input("Choose a robot " + "(enter a number from 0 to {}): ".format(len(robots) - 1))
+ # parse input into a number within range
+ k = min(max(int(s), 0), len(robots) - 1)
+ except:
+ k = 0
+ print("Input is not valid. Use {} by default.".format(list(robots)[k]))
+
+ # Return requested robot
+ return list(robots)[k]
+
+
+def input2action(device, robot, active_arm="right", env_configuration=None):
+ """
+ Converts an input from an active device into a valid action sequence that can be fed into an env.step() call
+
+ If a reset is triggered from the device, immediately returns None. Else, returns the appropriate action
+
+ Args:
+ device (Device): A device from which user inputs can be converted into actions. Can be either a Spacemouse or
+ Keyboard device class
+
+ robot (Robot): Which robot we're controlling
+
+ active_arm (str): Only applicable for multi-armed setups (e.g.: multi-arm environments or bimanual robots).
+ Allows inputs to be converted correctly if the control type (e.g.: IK) is dependent on arm choice.
+ Choices are {right, left}
+
+ env_configuration (str or None): Only applicable for multi-armed environments. Allows inputs to be converted
+ correctly if the control type (e.g.: IK) is dependent on the environment setup. Options are:
+ {bimanual, single-arm-parallel, single-arm-opposed}
+
+ Returns:
+ 2-tuple:
+
+ - (None or np.array): Action interpreted from @device including any gripper action(s). None if we get a
+ reset signal from the device
+ - (None or int): 1 if desired close, -1 if desired open gripper state. None if get a reset signal from the
+ device
+
+ """
+ state = device.get_controller_state()
+ # Note: Devices output rotation with x and z flipped to account for robots starting with gripper facing down
+ # Also note that the outputted rotation is an absolute rotation, while outputted dpos is delta pos
+ # Raw delta rotations from neutral user input are captured in raw_drotation (roll, pitch, yaw)
+ dpos, rotation, raw_drotation, grasp, reset = (
+ state["dpos"],
+ state["rotation"],
+ state["raw_drotation"],
+ state["grasp"],
+ state["reset"],
+ )
+
+ # If we're resetting, immediately return None
+ if reset:
+ return None, None
+
+ # Get controller reference
+ controller = robot.controller if not isinstance(robot, Bimanual) else robot.controller[active_arm]
+ gripper_dof = robot.gripper.dof if not isinstance(robot, Bimanual) else robot.gripper[active_arm].dof
+
+ # First process the raw drotation
+ drotation = raw_drotation[[1, 0, 2]]
+ if controller.name == "IK_POSE":
+ # If this is panda, want to swap x and y axis
+ if isinstance(robot.robot_model, Panda):
+ drotation = drotation[[1, 0, 2]]
+ else:
+ # Flip x
+ drotation[0] = -drotation[0]
+ # Scale rotation for teleoperation (tuned for IK)
+ drotation *= 10
+ dpos *= 5
+ # relative rotation of desired from current eef orientation
+ # map to quat
+ drotation = T.mat2quat(T.euler2mat(drotation))
+
+ # If we're using a non-forward facing configuration, need to adjust relative position / orientation
+ if env_configuration == "single-arm-opposed":
+ # Swap x and y for pos and flip x,y signs for ori
+ dpos = dpos[[1, 0, 2]]
+ drotation[0] = -drotation[0]
+ drotation[1] = -drotation[1]
+ if active_arm == "left":
+ # x pos needs to be flipped
+ dpos[0] = -dpos[0]
+ else:
+ # y pos needs to be flipped
+ dpos[1] = -dpos[1]
+
+ # Lastly, map to axis angle form
+ drotation = T.quat2axisangle(drotation)
+
+ elif controller.name == "OSC_POSE":
+ # Flip z
+ drotation[2] = -drotation[2]
+ # Scale rotation for teleoperation (tuned for OSC) -- gains tuned for each device
+ drotation = drotation * 1.5 if isinstance(device, Keyboard) else drotation * 50
+ dpos = dpos * 75 if isinstance(device, Keyboard) else dpos * 125
+ elif controller.name == "OSC_POSITION":
+ dpos = dpos * 75 if isinstance(device, Keyboard) else dpos * 125
+ else:
+ # No other controllers currently supported
+ print("Error: Unsupported controller specified -- Robot must have either an IK or OSC-based controller!")
+
+ # map 0 to -1 (open) and map 1 to 1 (closed)
+ grasp = 1 if grasp else -1
+
+ # Create action based on action space of individual robot
+ if controller.name == "OSC_POSITION":
+ action = np.concatenate([dpos, [grasp] * gripper_dof])
+ else:
+ action = np.concatenate([dpos, drotation, [grasp] * gripper_dof])
+
+ # Return the action and grasp
+ return action, grasp
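+
+
+# Illustrative usage sketch (not part of the upstream robosuite API).
+def _example_teleop_step(env, device):
+    """
+    Sketch of a single teleoperation step, assuming @device is an already-started Keyboard or
+    SpaceMouse device and @env is a single-arm robosuite environment with an OSC/IK controller.
+    """
+    action, grasp = input2action(device=device, robot=env.robots[0], active_arm="right")
+    if action is None:
+        # a reset was requested from the device
+        env.reset()
+        return None
+    obs, reward, done, info = env.step(action)
+    return obs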
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/log_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/log_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e902fc3788c8a34c60a728d8cf889c66f4dbb370
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/log_utils.py
@@ -0,0 +1,102 @@
+"""
+This file contains utility classes and functions for logging to stdout and stderr
+Adapted from robomimic: https://github.com/ARISE-Initiative/robomimic/blob/master/robomimic/utils/log_utils.py
+"""
+import logging
+import os
+import time
+
+from termcolor import colored
+
+import robosuite.macros as macros
+
+LEVEL_COLORS = {
+ logging.DEBUG: "green",
+ logging.INFO: "green",
+ logging.WARNING: "yellow",
+ logging.ERROR: "red",
+ logging.CRITICAL: "red",
+}
+
+FORMAT_STR = {"file": "[robosuite %(levelname)s - %(asctime)s] ", "console": "[robosuite %(levelname)s] "}
+
+MESSAGE_STR = "%(message)s (%(filename)s:%(lineno)d)"
+
+
+class FileFormatter(logging.Formatter):
+ """Formatter class of logging for file logging."""
+
+ FORMATS = {
+ levelno: colored(FORMAT_STR["file"], color, attrs=["bold"]) + MESSAGE_STR
+ for (levelno, color) in LEVEL_COLORS.items()
+ }
+
+ def format(self, record):
+ """Apply custom fomatting on LogRecord object record."""
+ log_fmt = self.FORMATS.get(record.levelno)
+ formatter = logging.Formatter(log_fmt, "%Y-%m-%d %H:%M:%S")
+ return formatter.format(record)
+
+
+class ConsoleFormatter(logging.Formatter):
+ """Formatter class of logging for console logging."""
+
+ FORMATS = {
+ logging.DEBUG: FORMAT_STR["console"] + MESSAGE_STR,
+ logging.INFO: "%(message)s",
+ logging.WARNING: colored(FORMAT_STR["console"], "yellow", attrs=["bold"]) + MESSAGE_STR,
+ logging.ERROR: colored(FORMAT_STR["console"], "red", attrs=["bold"]) + MESSAGE_STR,
+ logging.CRITICAL: colored(FORMAT_STR["console"], "red", attrs=["bold", "reverse"]) + MESSAGE_STR,
+ }
+
+ def format(self, record):
+ """Apply custom fomatting on LogRecord object record."""
+ log_fmt = self.FORMATS.get(record.levelno)
+ formatter = logging.Formatter(log_fmt)
+ return formatter.format(record)
+
+
+class DefaultLogger:
+ """Default logger class in robosuite codebase."""
+
+ def __init__(self, logger_name="robosuite_logs", console_logging_level="INFO", file_logging_level=None):
+ """
+ Args:
+ logger_name (str, optional): logger name. Defaults to "robosuite_logs".
+ console_logging_level (str, optional): logging level for console logging. Defaults to "INFO".
+ file_logging_level (str or None, optional): logging level for file logging. Defaults to None (file logging disabled).
+ """
+ self.logger_name = logger_name
+ logger = logging.getLogger(self.logger_name)
+
+ if file_logging_level is not None:
+ time_str = str(time.time()).replace(".", "_")
+ log_file_path = "/tmp/robosuite_{}_{}.log".format(time_str, os.getpid())
+ fh = logging.FileHandler(log_file_path)
+ print(colored("[robosuite]: Saving logs to {}".format(log_file_path), "yellow"))
+ fh.setLevel(logging.getLevelName(file_logging_level))
+ file_formatter = FileFormatter()
+ fh.setFormatter(file_formatter)
+ logger.addHandler(fh)
+
+ if console_logging_level is not None:
+ ch = logging.StreamHandler()
+ ch.setLevel(logging.getLevelName(console_logging_level))
+ console_formatter = ConsoleFormatter()
+ ch.setFormatter(console_formatter)
+ logger.addHandler(ch)
+
+ def get_logger(self):
+ """_summary_
+
+ Returns:
+ DefaultLogger: The retrieved logger whose name equals self.logger_name
+ """
+ logger = logging.getLogger(self.logger_name)
+ return logger
+
+
+ROBOSUITE_DEFAULT_LOGGER = DefaultLogger(
+ console_logging_level=macros.CONSOLE_LOGGING_LEVEL,
+ file_logging_level=macros.FILE_LOGGING_LEVEL,
+).get_logger()
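+
+
+# Illustrative usage sketch (not part of the upstream robosuite API).
+def _example_log_messages():
+    """
+    Sketch of emitting messages through the shared robosuite logger; records below the configured
+    console level (macros.CONSOLE_LOGGING_LEVEL) are filtered out by the handler.
+    """
+    ROBOSUITE_DEFAULT_LOGGER.info("environment created")
+    ROBOSUITE_DEFAULT_LOGGER.warning("gripper is close to a joint limit")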
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/mjcf_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/mjcf_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6af7f647763622f661ed518d16c37add5fa08541
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/mjcf_utils.py
@@ -0,0 +1,855 @@
+# utility functions for manipulating MJCF XML models
+
+import os
+import xml.etree.ElementTree as ET
+from collections.abc import Iterable
+from copy import deepcopy
+from pathlib import Path
+
+import numpy as np
+from PIL import Image
+
+import robosuite
+
+RED = [1, 0, 0, 1]
+GREEN = [0, 1, 0, 1]
+BLUE = [0, 0, 1, 1]
+CYAN = [0, 1, 1, 1]
+ROBOT_COLLISION_COLOR = [0, 0.5, 0, 1]
+MOUNT_COLLISION_COLOR = [0.5, 0.5, 0, 1]
+GRIPPER_COLLISION_COLOR = [0, 0, 0.5, 1]
+OBJECT_COLLISION_COLOR = [0.5, 0, 0, 1]
+ENVIRONMENT_COLLISION_COLOR = [0.5, 0.5, 0, 1]
+SENSOR_TYPES = {
+ "touch",
+ "accelerometer",
+ "velocimeter",
+ "gyro",
+ "force",
+ "torque",
+ "magnetometer",
+ "rangefinder",
+ "jointpos",
+ "jointvel",
+ "tendonpos",
+ "tendonvel",
+ "actuatorpos",
+ "actuatorvel",
+ "actuatorfrc",
+ "ballangvel",
+ "jointlimitpos",
+ "jointlimitvel",
+ "jointlimitfrc",
+ "tendonlimitpos",
+ "tendonlimitvel",
+ "tendonlimitfrc",
+ "framepos",
+ "framequat",
+ "framexaxis",
+ "frameyaxis",
+ "framezaxis",
+ "framelinvel",
+ "frameangvel",
+ "framelinacc",
+ "frameangacc",
+ "subtreecom",
+ "subtreelinvel",
+ "subtreeangmom",
+ "user",
+}
+
+MUJOCO_NAMED_ATTRIBUTES = {
+ "class",
+ "childclass",
+ "name",
+ "objname",
+ "material",
+ "texture",
+ "joint",
+ "joint1",
+ "joint2",
+ "jointinparent",
+ "geom",
+ "geom1",
+ "geom2",
+ "mesh",
+ "fixed",
+ "actuator",
+ "objname",
+ "tendon",
+ "tendon1",
+ "tendon2",
+ "slidesite",
+ "cranksite",
+ "body",
+ "body1",
+ "body2",
+ "hfield",
+ "target",
+ "prefix",
+ "site",
+}
+
+IMAGE_CONVENTION_MAPPING = {
+ "opengl": 1,
+ "opencv": -1,
+}
+
+TEXTURE_FILES = {
+ "WoodRed": "red-wood.png",
+ "WoodGreen": "green-wood.png",
+ "WoodBlue": "blue-wood.png",
+ "WoodLight": "light-wood.png",
+ "WoodDark": "dark-wood.png",
+ "WoodTiles": "wood-tiles.png",
+ "WoodPanels": "wood-varnished-panels.png",
+ "WoodgrainGray": "gray-woodgrain.png",
+ "PlasterCream": "cream-plaster.png",
+ "PlasterPink": "pink-plaster.png",
+ "PlasterYellow": "yellow-plaster.png",
+ "PlasterGray": "gray-plaster.png",
+ "PlasterWhite": "white-plaster.png",
+ "BricksWhite": "white-bricks.png",
+ "Metal": "metal.png",
+ "SteelBrushed": "steel-brushed.png",
+ "SteelScratched": "steel-scratched.png",
+ "Brass": "brass-ambra.png",
+ "Bread": "bread.png",
+ "Can": "can.png",
+ "Ceramic": "ceramic.png",
+ "Cereal": "cereal.png",
+ "Clay": "clay.png",
+ "Dirt": "dirt.png",
+ "Glass": "glass.png",
+ "FeltGray": "gray-felt.png",
+ "Lemon": "lemon.png",
+}
+
+TEXTURES = {
+ texture_name: os.path.join("textures", texture_file) for (texture_name, texture_file) in TEXTURE_FILES.items()
+}
+
+ALL_TEXTURES = TEXTURES.keys()
+
+
+class CustomMaterial(object):
+ """
+ Simple class to instantiate the necessary parameters to define an appropriate texture / material combo
+
+ Instantiates a nested dict holding necessary components for procedurally generating a texture / material combo
+
+ Please see http://www.mujoco.org/book/XMLreference.html#asset for specific details on
+ attributes expected for Mujoco texture / material tags, respectively
+
+ Note that the values in @tex_attrib and @mat_attrib can be in string or array / numerical form.
+
+ Args:
+ texture (None or str or 4-array): Name of texture file to be imported. If a string, should be part of
+ ALL_TEXTURES. If texture is a 4-array, then this argument will be interpreted as an rgba tuple value and
+ a template png will be procedurally generated during object instantiation, with any additional
+ texture / material attributes specified. If None, no file will be linked and no rgba value will be set
+ Note, if specified, the RGBA values are expected to be floats between 0 and 1
+
+ tex_name (str): Name to reference the imported texture
+
+ mat_name (str): Name to reference the imported material
+
+ tex_attrib (dict): Any other optional mujoco texture specifications.
+
+ mat_attrib (dict): Any other optional mujoco material specifications.
+
+ shared (bool): If True, this material is shared and will not have any naming prefixes added to its names
+
+ Raises:
+ AssertionError: [Invalid texture]
+ """
+
+ def __init__(
+ self,
+ texture,
+ tex_name,
+ mat_name,
+ tex_attrib=None,
+ mat_attrib=None,
+ shared=False,
+ ):
+ # Check if the desired texture is an rgba value
+ if type(texture) is str:
+ default = False
+ # Verify that requested texture is valid
+ assert texture in ALL_TEXTURES, "Error: Requested invalid texture. Got {}. Valid options are:\n{}".format(
+ texture, ALL_TEXTURES
+ )
+ else:
+ default = True
+ # If specified, this is an rgba value and a default texture is desired; make sure length of rgba array is 4
+ if texture is not None:
+ assert len(texture) == 4, (
+ "Error: Requested default texture. Got array of length {}."
+ "Expected rgba array of length 4.".format(len(texture))
+ )
+
+ # Setup the texture and material attributes
+ self.tex_attrib = {} if tex_attrib is None else tex_attrib.copy()
+ self.mat_attrib = {} if mat_attrib is None else mat_attrib.copy()
+
+ # Add in name values
+ self.name = mat_name
+ self.shared = shared
+ self.tex_attrib["name"] = tex_name
+ self.mat_attrib["name"] = mat_name
+ self.mat_attrib["texture"] = tex_name
+
+ # Loop through all attributes and convert all non-string values into strings
+ for attrib in (self.tex_attrib, self.mat_attrib):
+ for k, v in attrib.items():
+ if type(v) is not str:
+ if isinstance(v, Iterable):
+ attrib[k] = array_to_string(v)
+ else:
+ attrib[k] = str(v)
+
+ # Handle default and non-default cases separately for linking texture patch file locations
+ if not default:
+ # Add in the filepath to texture patch
+ self.tex_attrib["file"] = xml_path_completion(TEXTURES[texture])
+ else:
+ if texture is not None:
+ # Create a texture patch
+ tex = Image.new("RGBA", (100, 100), tuple((np.array(texture) * 255).astype("int")))
+ # Create temp directory if it does not exist
+ save_dir = "/tmp/robosuite_temp_tex"
+ Path(save_dir).mkdir(parents=True, exist_ok=True)
+ # Save this texture patch to the temp directory on disk (MacOS / Linux)
+ fpath = save_dir + "/{}.png".format(tex_name)
+ tex.save(fpath, "PNG")
+ # Link this texture file to the default texture dict
+ self.tex_attrib["file"] = fpath
+
+
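+# Illustrative usage sketch (not part of the upstream robosuite API).
+def _example_redwood_material():
+    """
+    Sketch of building a CustomMaterial from one of the bundled textures in ALL_TEXTURES; the
+    resulting tex_attrib / mat_attrib dicts can be turned into <texture> and <material> elements
+    with new_element() below.
+    """
+    return CustomMaterial(
+        texture="WoodRed",
+        tex_name="demo_redwood_tex",
+        mat_name="demo_redwood_mat",
+        tex_attrib={"type": "cube"},
+        mat_attrib={"texrepeat": "1 1", "specular": "0.4", "shininess": "0.1"},
+    )
+
+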
+def xml_path_completion(xml_path):
+ """
+ Takes in a local xml path and returns a full path.
+ if @xml_path is absolute, do nothing
+ if @xml_path is not absolute, load xml that is shipped by the package
+
+ Args:
+ xml_path (str): local xml path
+
+ Returns:
+ str: Full (absolute) xml path
+ """
+ if xml_path.startswith("/"):
+ full_path = xml_path
+ else:
+ full_path = os.path.join(robosuite.models.assets_root, xml_path)
+ return full_path
+
+
+def array_to_string(array):
+ """
+ Converts a numeric array into the string format in mujoco.
+
+ Examples:
+ [0, 1, 2] => "0 1 2"
+
+ Args:
+ array (n-array): Array to convert to a string
+
+ Returns:
+ str: String equivalent of @array
+ """
+ return " ".join(["{}".format(x) for x in array])
+
+
+def string_to_array(string):
+ """
+ Converts an array string in mujoco xml to np.array.
+
+ Examples:
+ "0 1 2" => [0, 1, 2]
+
+ Args:
+ string (str): String to convert to an array
+
+ Returns:
+ np.array: Numerical array equivalent of @string
+ """
+ return np.array([float(x) for x in string.strip().split(" ")])
+
+
+def convert_to_string(inp):
+ """
+ Converts any type of {bool, int, float, list, tuple, array, string, np.str_} into a mujoco-xml compatible string.
+ Note that an input string / np.str_ results in a no-op action.
+
+ Args:
+ inp: Input to convert to string
+
+ Returns:
+ str: String equivalent of @inp
+ """
+ if type(inp) in {list, tuple, np.ndarray}:
+ return array_to_string(inp)
+ elif type(inp) in {int, float, bool}:
+ return str(inp).lower()
+ elif type(inp) in {str, np.str_}:
+ return inp
+ else:
+ raise ValueError("Unsupported type received: got {}".format(type(inp)))
+
+
+def set_alpha(node, alpha=0.1):
+ """
+ Sets the a(lpha) field of the rgba attribute to @alpha for @node and all of
+ its subnodes. Used for managing display transparency.
+
+ Args:
+ node (ET.Element): Specific node element within XML tree
+ alpha (float): Value to set alpha value of rgba tuple
+ """
+ for child_node in node.findall(".//*[@rgba]"):
+ rgba_orig = string_to_array(child_node.get("rgba"))
+ child_node.set("rgba", array_to_string(list(rgba_orig[0:3]) + [alpha]))
+
+
+def new_element(tag, name, **kwargs):
+ """
+ Creates a new @tag element with attributes specified by @**kwargs.
+
+ Args:
+ tag (str): Type of element to create
+ name (None or str): Name for this element. Should only be None for elements that do not have an explicit
+ name attribute (e.g.: inertial elements)
+ **kwargs: Specified attributes for the new joint
+
+ Returns:
+ ET.Element: new specified xml element
+ """
+ # Name will be set if it's not None
+ if name is not None:
+ kwargs["name"] = name
+ # Loop through all attributes and pop any that are None, otherwise convert them to strings
+ for k, v in kwargs.copy().items():
+ if v is None:
+ kwargs.pop(k)
+ else:
+ kwargs[k] = convert_to_string(v)
+ element = ET.Element(tag, attrib=kwargs)
+ return element
+
+
+def new_joint(name, **kwargs):
+ """
+ Creates a joint tag with attributes specified by @**kwargs.
+
+ Args:
+ name (str): Name for this joint
+ **kwargs: Specified attributes for the new joint
+
+ Returns:
+ ET.Element: new joint xml element
+ """
+ return new_element(tag="joint", name=name, **kwargs)
+
+
+def new_actuator(name, joint, act_type="actuator", **kwargs):
+ """
+ Creates an actuator tag with attributes specified by @**kwargs.
+
+ Args:
+ name (str): Name for this actuator
+ joint (str): type of actuator transmission.
+ see all types here: http://mujoco.org/book/modeling.html#actuator
+ act_type (str): actuator type. Defaults to "actuator"
+ **kwargs: Any additional specified attributes for the new joint
+
+ Returns:
+ ET.Element: new actuator xml element
+ """
+ element = new_element(tag=act_type, name=name, **kwargs)
+ element.set("joint", joint)
+ return element
+
+
+def new_site(name, rgba=RED, pos=(0, 0, 0), size=(0.005,), **kwargs):
+ """
+ Creates a site element with attributes specified by @**kwargs.
+
+ NOTE: With the exception of @name, @pos, and @size, if any arg is set to
+ None, the value will automatically be popped before passing the values
+ to create the appropriate XML
+
+ Args:
+ name (str): Name for this site
+ rgba (4-array): (r,g,b,a) color and transparency. Defaults to solid red.
+ pos (3-array): (x,y,z) 3d position of the site.
+ size (n-array of float): site size (sites are spherical by default).
+ **kwargs: Any additional specified attributes for the new site
+
+ Returns:
+ ET.Element: new site xml element
+ """
+ kwargs["pos"] = pos
+ kwargs["size"] = size
+ kwargs["rgba"] = rgba if rgba is not None else None
+ return new_element(tag="site", name=name, **kwargs)
+
+
+def new_geom(name, type, size, pos=(0, 0, 0), group=0, **kwargs):
+ """
+ Creates a geom element with attributes specified by @**kwargs.
+
+ NOTE: With the exception of @geom_type, @size, and @pos, if any arg is set to
+ None, the value will automatically be popped before passing the values
+ to create the appropriate XML
+
+ Args:
+ name (str): Name for this geom
+ type (str): type of the geom.
+ see all types here: http://mujoco.org/book/modeling.html#geom
+ size (n-array of float): geom size parameters.
+ pos (3-array): (x,y,z) 3d position of the site.
+ group (int): the integer group that the geom belongs to. Useful for
+ separating visual and physical elements.
+ **kwargs: Any additional specified attributes for the new geom
+
+ Returns:
+ ET.Element: new geom xml element
+ """
+ kwargs["type"] = type
+ kwargs["size"] = size
+ kwargs["pos"] = pos
+ kwargs["group"] = group if group is not None else None
+ return new_element(tag="geom", name=name, **kwargs)
+
+
+def new_body(name, pos=(0, 0, 0), **kwargs):
+ """
+ Creates a body element with attributes specified by @**kwargs.
+
+ Args:
+ name (str): Name for this body
+ pos (3-array): (x,y,z) 3d position of the body frame.
+ **kwargs: Any additional specified attributes for the new body
+
+ Returns:
+ ET.Element: new body xml element
+ """
+ kwargs["pos"] = pos
+ return new_element(tag="body", name=name, **kwargs)
+
+
+def new_inertial(pos=(0, 0, 0), mass=None, **kwargs):
+ """
+ Creates a inertial element with attributes specified by @**kwargs.
+
+ Args:
+ pos (3-array): (x,y,z) 3d position of the inertial frame.
+ mass (float): The mass of inertial
+ **kwargs: Any additional specified attributes for the new inertial element
+
+ Returns:
+ ET.Element: new inertial xml element
+ """
+ kwargs["mass"] = mass if mass is not None else None
+ kwargs["pos"] = pos
+ return new_element(tag="inertial", name=None, **kwargs)
+
+
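+# Illustrative usage sketch (not part of the upstream robosuite API).
+def _example_box_body():
+    """
+    Sketch of composing the element helpers above into a small free body: a red box geom with a
+    free joint attached under a new body element, returned as an XML string.
+    """
+    body = new_body(name="demo_box", pos=(0, 0, 0.9))
+    body.append(new_joint(name="demo_box_joint", type="free"))
+    body.append(new_geom(name="demo_box_g0", type="box", size=(0.02, 0.02, 0.02), rgba=RED))
+    return ET.tostring(body, encoding="unicode")
+
+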
+def get_size(size, size_max, size_min, default_max, default_min):
+ """
+ Helper method for providing a size, or a range to randomize from
+
+ Args:
+ size (n-array): Array of numbers that explicitly define the size
+ size_max (n-array): Array of numbers that define the custom max size from which to randomly sample
+ size_min (n-array): Array of numbers that define the custom min size from which to randomly sample
+ default_max (n-array): Array of numbers that define the default max size from which to randomly sample
+ default_min (n-array): Array of numbers that define the default min size from which to randomly sample
+
+ Returns:
+ np.array: size generated
+
+ Raises:
+ ValueError: [Inconsistent array sizes]
+ """
+ if len(default_max) != len(default_min):
+ raise ValueError(
+ "default_max = {} and default_min = {}".format(str(default_max), str(default_min))
+ + " have different lengths"
+ )
+ if size is not None:
+ if (size_max is not None) or (size_min is not None):
+ raise ValueError("size = {} overrides size_max = {}, size_min = {}".format(size, size_max, size_min))
+ else:
+ if size_max is None:
+ size_max = default_max
+ if size_min is None:
+ size_min = default_min
+ size = np.array([np.random.uniform(size_min[i], size_max[i]) for i in range(len(default_max))])
+ return np.array(size)
+
+
+def add_to_dict(dic, fill_in_defaults=True, default_value=None, **kwargs):
+ """
+ Helper function to add key-values to dictionary @dic where each entry is its own array (list).
+ Args:
+ dic (dict): Dictionary to which new key / value pairs will be added. If the key already exists,
+ will append the value to that key entry
+ fill_in_defaults (bool): If True, will automatically add @default_value to all dictionary entries that are
+ not explicitly specified in @kwargs
+ default_value (any): Default value to fill (None by default)
+
+ Returns:
+ dict: Modified dictionary
+ """
+    # Get keys and the length of the (list) value of an existing entry in dic
+    keys = set(dic.keys())
+    n = len(dic[list(keys)[0]]) if keys else 0
+ for k, v in kwargs.items():
+ if k in dic:
+ dic[k].append(v)
+ keys.remove(k)
+ else:
+ dic[k] = [default_value] * n + [v] if fill_in_defaults else [v]
+ # If filling in defaults, fill in remaining default values
+ if fill_in_defaults:
+ for k in keys:
+ dic[k].append(default_value)
+ return dic
+
+
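+# A minimal usage sketch for add_to_dict, showing how entries grow in lockstep and how
+# missing keys are padded with @default_value when fill_in_defaults is True:
+#
+#     d = add_to_dict({}, a=1, b=2)   # {"a": [1], "b": [2]}
+#     d = add_to_dict(d, a=3)         # {"a": [1, 3], "b": [2, None]}
+
+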
+def add_prefix(
+ root,
+ prefix,
+ tags="default",
+ attribs="default",
+ exclude=None,
+):
+ """
+    Finds all element(s) matching the requested @tags, and prepends @prefix to the specified @attribs if they exist.
+
+ Args:
+ root (ET.Element): Root of the xml element tree to start recursively searching through.
+ prefix (str): Prefix to add to all specified attributes
+        tags (str or list of str or set): Tag(s) to search for in this ElementTree. "default" corresponds to all tags
+        attribs (str or list of str or set): Element attribute(s) to prepend the prefix to. "default" corresponds
+            to all attributes that reference names
+ exclude (None or function): Filtering function that should take in an ET.Element or a string (attribute) and
+ return True if we should exclude the given element / attribute from having any prefixes added
+ """
+ # Standardize tags and attributes to be a set
+ if tags != "default":
+ tags = {tags} if type(tags) is str else set(tags)
+ if attribs == "default":
+ attribs = MUJOCO_NAMED_ATTRIBUTES
+ attribs = {attribs} if type(attribs) is str else set(attribs)
+
+ # Check the current element for matching conditions
+ if (tags == "default" or root.tag in tags) and (exclude is None or not exclude(root)):
+ for attrib in attribs:
+ v = root.get(attrib, None)
+            # Only add the prefix if the attribute exists, its value doesn't already begin with the prefix,
+ # and the @exclude filter is either None or returns False
+ if v is not None and not v.startswith(prefix) and (exclude is None or not exclude(v)):
+ root.set(attrib, prefix + v)
+ # Continue recursively searching through the element tree
+ for r in root:
+ add_prefix(root=r, prefix=prefix, tags=tags, attribs=attribs, exclude=exclude)
+
+
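+# A minimal usage sketch for add_prefix, assuming MUJOCO_NAMED_ATTRIBUTES (defined elsewhere
+# in this module) includes name-referencing attributes such as "name":
+#
+#     import xml.etree.ElementTree as ET
+#
+#     root = ET.fromstring('<body name="base"><geom name="g0"/><site name="s0"/></body>')
+#     add_prefix(root=root, prefix="robot0_")
+#     # names now read "robot0_base", "robot0_g0", "robot0_s0"
+
+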
+def add_material(root, naming_prefix="", custom_material=None):
+ """
+ Iterates through all element(s) in @root recursively and adds a material / texture to all visual geoms that don't
+ already have a material specified.
+
+ Args:
+ root (ET.Element): Root of the xml element tree to start recursively searching through.
+ naming_prefix (str): Adds this prefix to all material and texture names
+ custom_material (None or CustomMaterial): If specified, will add this material to all visual geoms.
+ Else, will add a default "no-change" material.
+
+ Returns:
+        4-tuple: (tex_element, mat_element, material, used) -- the new texture and material XML elements,
+            the CustomMaterial that was applied, and whether the material was actually used by any geom.
+ """
+ # Initialize used as False
+ used = False
+ # First, make sure material is specified
+ if custom_material is None:
+ custom_material = CustomMaterial(
+ texture=None,
+ tex_name="default_tex",
+ mat_name="default_mat",
+ tex_attrib={
+ "type": "cube",
+ "builtin": "flat",
+ "width": 100,
+ "height": 100,
+ "rgb1": np.ones(3),
+ "rgb2": np.ones(3),
+ },
+ )
+    # Make sure the custom material's names begin with the specified prefix (unless the material is shared)
+ if not custom_material.name.startswith(naming_prefix) and not custom_material.shared:
+ custom_material.name = naming_prefix + custom_material.name
+ custom_material.tex_attrib["name"] = naming_prefix + custom_material.tex_attrib["name"]
+ custom_material.mat_attrib["name"] = naming_prefix + custom_material.mat_attrib["name"]
+ custom_material.mat_attrib["texture"] = naming_prefix + custom_material.mat_attrib["texture"]
+
+ # Check the current element for matching conditions
+ if root.tag == "geom" and root.get("group", None) == "1" and root.get("material", None) is None:
+ # Add a new material attribute to this geom
+ root.set("material", custom_material.name)
+ # Set used to True
+ used = True
+ # Continue recursively searching through the element tree
+ for r in root:
+ _, _, _, _used = add_material(root=r, naming_prefix=naming_prefix, custom_material=custom_material)
+ # Update used
+ used = used or _used
+ # Lastly, return the new texture and material elements
+ tex_element = new_element(tag="texture", **custom_material.tex_attrib)
+ mat_element = new_element(tag="material", **custom_material.mat_attrib)
+ return tex_element, mat_element, custom_material, used
+
+
+def recolor_collision_geoms(root, rgba, exclude=None):
+ """
+    Recursively searches through all elements starting with @root to find all geoms belonging to group 0 and sets
+    the corresponding rgba value to the specified @rgba argument. Note: also removes any material values for these
+ elements.
+
+ Args:
+ root (ET.Element): Root of the xml element tree to start recursively searching through
+ rgba (4-array): (R, G, B, A) values to assign to all geoms with this group.
+ exclude (None or function): Filtering function that should take in an ET.Element and
+ return True if we should exclude the given element / attribute from having its collision geom impacted.
+ """
+    # Check this element
+ if root.tag == "geom" and root.get("group") in {None, "0"} and (exclude is None or not exclude(root)):
+ root.set("rgba", array_to_string(rgba))
+ root.attrib.pop("material", None)
+
+ # Iterate through all children elements
+ for r in root:
+ recolor_collision_geoms(root=r, rgba=rgba, exclude=exclude)
+
+
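+# A minimal usage sketch for recolor_collision_geoms: make every group-0 (collision) geom
+# under a model root translucent red and strip its material, skipping an (assumed) gripper
+# subtree; `model_root` and the "gripper" name prefix are illustrative:
+#
+#     recolor_collision_geoms(
+#         root=model_root,
+#         rgba=(1.0, 0.0, 0.0, 0.3),
+#         exclude=lambda e: e.get("name", "").startswith("gripper"),
+#     )
+
+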
+def _element_filter(element, parent):
+ """
+ Default element filter to be used in sort_elements. This will filter for the following groups:
+
+ :`'root_body'`: Top-level body element
+ :`'bodies'`: Any body elements
+ :`'joints'`: Any joint elements
+ :`'actuators'`: Any actuator elements
+ :`'sites'`: Any site elements
+ :`'sensors'`: Any sensor elements
+ :`'contact_geoms'`: Any geoms used for collision (as specified by group 0 (default group) geoms)
+ :`'visual_geoms'`: Any geoms used for visual rendering (as specified by group 1 geoms)
+
+ Args:
+ element (ET.Element): Current XML element that we are filtering
+ parent (ET.Element): Parent XML element for the current element
+
+ Returns:
+ str or None: Assigned filter key for this element. None if no matching filter is found.
+ """
+ # Check for actuator first since this is dependent on the parent element
+ if parent is not None and parent.tag == "actuator":
+ return "actuators"
+ elif element.tag == "joint":
+ # Make sure this is not a tendon (this should not have a "joint", "joint1", or "joint2" attribute specified)
+ if element.get("joint") is None and element.get("joint1") is None:
+ return "joints"
+ elif element.tag == "body":
+ # If the parent of this does not have a tag "body", then this is the top-level body element
+ if parent is None or parent.tag != "body":
+ return "root_body"
+ return "bodies"
+ elif element.tag == "site":
+ return "sites"
+ elif element.tag in SENSOR_TYPES:
+ return "sensors"
+ elif element.tag == "geom":
+ # Only get collision and visual geoms (group 0 / None, or 1, respectively)
+ group = element.get("group")
+ if group in {None, "0", "1"}:
+ return "visual_geoms" if group == "1" else "contact_geoms"
+ else:
+ # If no condition met, return None
+ return None
+
+
+def sort_elements(root, parent=None, element_filter=None, _elements_dict=None):
+ """
+ Utility method to iteratively sort all elements based on @tags. This XML ElementTree will be parsed such that
+ all elements with the same key as returned by @element_filter will be grouped as a list entry in the returned
+ dictionary.
+
+ Args:
+ root (ET.Element): Root of the xml element tree to start recursively searching through
+ parent (ET.Element): Parent of the root node. Default is None (no parent node initially)
+ element_filter (None or function): Function used to filter the incoming elements. Should take in two
+ ET.Elements (current_element, parent_element) and return a string filter_key if the element
+ should be added to the list of values sorted by filter_key, and return None if no value should be added.
+            If no element_filter is specified, defaults to the module-level _element_filter.
+ _elements_dict (dict): Dictionary that gets passed to recursive calls. Should not be modified externally by
+ top-level call.
+
+ Returns:
+ dict: Filtered key-specific lists of the corresponding elements
+ """
+ # Initialize dictionary and element filter if None is set
+ if _elements_dict is None:
+ _elements_dict = {}
+ if element_filter is None:
+ element_filter = _element_filter
+
+ # Parse this element
+ key = element_filter(root, parent)
+ if key is not None:
+ # Initialize new entry in the dict if this is the first time encountering this value, otherwise append
+ if key not in _elements_dict:
+ _elements_dict[key] = [root]
+ else:
+ _elements_dict[key].append(root)
+
+    # Loop through all possible subtrees of this XML recursively
+ for r in root:
+ _elements_dict = sort_elements(
+ root=r, parent=root, element_filter=element_filter, _elements_dict=_elements_dict
+ )
+
+ return _elements_dict
+
+
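+# A minimal usage sketch for sort_elements on a parsed MJCF tree (the model file name is
+# hypothetical):
+#
+#     import xml.etree.ElementTree as ET
+#
+#     root = ET.parse("model.xml").getroot()
+#     groups = sort_elements(root=root.find("worldbody"))
+#     groups.get("root_body", [])   # top-level body element(s)
+#     groups.get("joints", [])      # all joint elements, if any
+
+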
+def find_parent(root, child):
+ """
+    Find the parent element of the specified @child node, recursively searching through @root.
+
+ Args:
+ root (ET.Element): Root of the xml element tree to start recursively searching through.
+ child (ET.Element): Child element whose parent is to be found
+
+ Returns:
+ None or ET.Element: Matching parent if found, else None
+ """
+    # Iterate through children (DFS); if the correct child element is found, return the current root as the parent
+ for r in root:
+ if r == child:
+ return root
+ parent = find_parent(root=r, child=child)
+ if parent is not None:
+ return parent
+ # If we get here, we didn't find anything ):
+ return None
+
+
+def find_elements(root, tags, attribs=None, return_first=True):
+ """
+ Find all element(s) matching the requested @tag and @attributes. If @return_first is True, then will return the
+ first element found matching the criteria specified. Otherwise, will return a list of elements that match the
+ criteria.
+
+ Args:
+ root (ET.Element): Root of the xml element tree to start recursively searching through.
+ tags (str or list of str or set): Tag(s) to search for in this ElementTree.
+ attribs (None or dict of str): Element attribute(s) to check against for a filtered element. A match is
+ considered found only if all attributes match. Each attribute key should have a corresponding value with
+ which to compare against.
+ return_first (bool): Whether to immediately return once the first matching element is found.
+
+ Returns:
+ None or ET.Element or list of ET.Element: Matching element(s) found. Returns None if there was no match.
+ """
+ # Initialize return value
+ elements = None if return_first else []
+
+ # Make sure tags is list
+ tags = [tags] if type(tags) is str else tags
+
+ # Check the current element for matching conditions
+ if root.tag in tags:
+ matching = True
+ if attribs is not None:
+ for k, v in attribs.items():
+ if root.get(k) != v:
+ matching = False
+ break
+ # If all criteria were matched, add this to the solution (or return immediately if specified)
+ if matching:
+ if return_first:
+ return root
+ else:
+ elements.append(root)
+ # Continue recursively searching through the element tree
+ for r in root:
+ if return_first:
+ elements = find_elements(tags=tags, attribs=attribs, root=r, return_first=return_first)
+ if elements is not None:
+ return elements
+ else:
+ found_elements = find_elements(tags=tags, attribs=attribs, root=r, return_first=return_first)
+ if found_elements:
+ elements += found_elements if type(found_elements) is list else [found_elements]
+
+ return elements if elements else None
+
+
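+# A minimal usage sketch for find_elements (the element and attribute names are illustrative):
+#
+#     table = find_elements(root=root, tags="geom", attribs={"name": "table_collision"}, return_first=True)
+#     sites = find_elements(root=root, tags="site", return_first=False)   # list of all sites, or None
+
+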
+def save_sim_model(sim, fname):
+ """
+ Saves the current model xml from @sim at file location @fname.
+
+ Args:
+        sim (MjSim): Simulation whose current model XML will be saved
+ fname (str): Absolute filepath to the location to save the file
+ """
+ with open(fname, "w") as f:
+ sim.save(file=f, format="xml")
+
+
+def get_ids(sim, elements, element_type="geom", inplace=False):
+ """
+ Grabs the mujoco IDs for each element in @elements, corresponding to the specified @element_type.
+
+ Args:
+ sim (MjSim): Active mujoco simulation object
+ elements (str or list or dict): Element(s) to convert into IDs. Note that the return type corresponds to
+ @elements type, where each element name is replaced with the ID
+ element_type (str): The type of element to grab ID for. Options are {geom, body, site}
+ inplace (bool): If False, will create a copy of @elements to prevent overwriting the original data structure
+
+ Returns:
+        int or list or dict: ID(s) corresponding to @elements, matching the structure of the input.
+ """
+ if not inplace:
+ # Copy elements first so we don't write to the underlying object
+ elements = deepcopy(elements)
+ # Choose what to do based on elements type
+ if isinstance(elements, str):
+ # We simply return the value of this single element
+ assert element_type in {
+ "geom",
+ "body",
+ "site",
+ }, f"element_type must be either geom, body, or site. Got: {element_type}"
+ if element_type == "geom":
+ elements = sim.model.geom_name2id(elements)
+ elif element_type == "body":
+ elements = sim.model.body_name2id(elements)
+ else: # site
+ elements = sim.model.site_name2id(elements)
+ elif isinstance(elements, dict):
+ # Iterate over each element in dict and recursively repeat
+        for name, ele in elements.items():
+ elements[name] = get_ids(sim=sim, elements=ele, element_type=element_type, inplace=True)
+ else: # We assume this is an iterable array
+ assert isinstance(elements, Iterable), "Elements must be iterable for get_id!"
+ elements = [get_ids(sim=sim, elements=ele, element_type=element_type, inplace=True) for ele in elements]
+
+ return elements
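+
+
+# A minimal usage sketch for get_ids, assuming an active MjSim `sim` and geom / body names
+# that exist in the loaded model (the names below are illustrative):
+#
+#     cube_geom_ids = get_ids(sim=sim, elements=["cube_g0", "cube_g1"], element_type="geom")
+#     cube_body_id = get_ids(sim=sim, elements="cube_main", element_type="body")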
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/mjmod.py b/phantom/submodules/phantom-robosuite/robosuite/utils/mjmod.py
new file mode 100644
index 0000000000000000000000000000000000000000..3712e619906214e4221b4937b655b7092aba7611
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/mjmod.py
@@ -0,0 +1,1997 @@
+"""
+Modder classes used for domain randomization. Largely based on the mujoco-py
+implementation linked below:
+
+https://github.com/openai/mujoco-py/blob/1fe312b09ae7365f0dd9d4d0e453f8da59fae0bf/mujoco_py/modder.py
+"""
+
+import copy
+import os
+from collections import defaultdict
+
+import numpy as np
+from PIL import Image
+
+import robosuite
+import robosuite.utils.transform_utils as trans
+from robosuite.utils.binding_utils import MjRenderContextOffscreen
+
+
+class BaseModder:
+ """
+ Base class meant to modify simulation attributes mid-sim.
+
+ Using @random_state ensures that sampling here won't be affected
+ by sampling that happens outside of the modders.
+
+ Args:
+ sim (MjSim): simulation object
+
+ random_state (RandomState): instance of np.random.RandomState, specific
+ seed used to randomize these modifications without impacting other
+ numpy seeds / randomizations
+ """
+
+ def __init__(self, sim, random_state=None):
+ self.sim = sim
+ if random_state is None:
+ # default to global RandomState instance
+ self.random_state = np.random.mtrand._rand
+ else:
+ self.random_state = random_state
+
+ def update_sim(self, sim):
+ """
+ Setter function to update internal sim variable
+
+ Args:
+ sim (MjSim): MjSim object
+ """
+ self.sim = sim
+
+ @property
+ def model(self):
+ """
+ Returns:
+ MjModel: Mujoco sim model
+ """
+ # Available for quick convenience access
+ return self.sim.model
+
+
+class LightingModder(BaseModder):
+ """
+ Modder to modify lighting within a Mujoco simulation.
+
+ Args:
+ sim (MjSim): MjSim object
+
+ random_state (RandomState): instance of np.random.RandomState
+
+ light_names (None or list of str): list of lights to use for randomization. If not provided, all
+ lights in the model are randomized.
+
+ randomize_position (bool): If True, randomizes position of lighting
+
+ randomize_direction (bool): If True, randomizes direction of lighting
+
+ randomize_specular (bool): If True, randomizes specular attribute of lighting
+
+ randomize_ambient (bool): If True, randomizes ambient attribute of lighting
+
+ randomize_diffuse (bool): If True, randomizes diffuse attribute of lighting
+
+ randomize_active (bool): If True, randomizes active nature of lighting
+
+ position_perturbation_size (float): Magnitude of position randomization
+
+ direction_perturbation_size (float): Magnitude of direction randomization
+
+ specular_perturbation_size (float): Magnitude of specular attribute randomization
+
+ ambient_perturbation_size (float): Magnitude of ambient attribute randomization
+
+ diffuse_perturbation_size (float): Magnitude of diffuse attribute randomization
+ """
+
+ def __init__(
+ self,
+ sim,
+ random_state=None,
+ light_names=None,
+ randomize_position=True,
+ randomize_direction=True,
+ randomize_specular=True,
+ randomize_ambient=True,
+ randomize_diffuse=True,
+ randomize_active=True,
+ position_perturbation_size=0.1,
+ direction_perturbation_size=0.35, # 20 degrees
+ specular_perturbation_size=0.1,
+ ambient_perturbation_size=0.1,
+ diffuse_perturbation_size=0.1,
+ ):
+ super().__init__(sim, random_state=random_state)
+
+ if light_names is None:
+ light_names = self.sim.model.light_names
+ self.light_names = light_names
+
+ self.randomize_position = randomize_position
+ self.randomize_direction = randomize_direction
+ self.randomize_specular = randomize_specular
+ self.randomize_ambient = randomize_ambient
+ self.randomize_diffuse = randomize_diffuse
+ self.randomize_active = randomize_active
+
+ self.position_perturbation_size = position_perturbation_size
+ self.direction_perturbation_size = direction_perturbation_size
+ self.specular_perturbation_size = specular_perturbation_size
+ self.ambient_perturbation_size = ambient_perturbation_size
+ self.diffuse_perturbation_size = diffuse_perturbation_size
+
+ self.save_defaults()
+
+ def save_defaults(self):
+ """
+ Uses the current MjSim state and model to save default parameter values.
+ """
+ self._defaults = {k: {} for k in self.light_names}
+ for name in self.light_names:
+ self._defaults[name]["pos"] = np.array(self.get_pos(name))
+ self._defaults[name]["dir"] = np.array(self.get_dir(name))
+ self._defaults[name]["specular"] = np.array(self.get_specular(name))
+ self._defaults[name]["ambient"] = np.array(self.get_ambient(name))
+ self._defaults[name]["diffuse"] = np.array(self.get_diffuse(name))
+ self._defaults[name]["active"] = self.get_active(name)
+
+ def restore_defaults(self):
+ """
+ Reloads the saved parameter values.
+ """
+ for name in self.light_names:
+ self.set_pos(name, self._defaults[name]["pos"])
+ self.set_dir(name, self._defaults[name]["dir"])
+ self.set_specular(name, self._defaults[name]["specular"])
+ self.set_ambient(name, self._defaults[name]["ambient"])
+ self.set_diffuse(name, self._defaults[name]["diffuse"])
+ self.set_active(name, self._defaults[name]["active"])
+
+ def randomize(self):
+ """
+ Randomizes all requested lighting values within the sim
+ """
+ for name in self.light_names:
+ if self.randomize_position:
+ self._randomize_position(name)
+
+ if self.randomize_direction:
+ self._randomize_direction(name)
+
+ if self.randomize_specular:
+ self._randomize_specular(name)
+
+ if self.randomize_ambient:
+ self._randomize_ambient(name)
+
+ if self.randomize_diffuse:
+ self._randomize_diffuse(name)
+
+ if self.randomize_active:
+ self._randomize_active(name)
+
+ def _randomize_position(self, name):
+ """
+ Helper function to randomize position of a specific light source
+
+ Args:
+ name (str): Name of the lighting source to randomize for
+ """
+ delta_pos = self.random_state.uniform(
+ low=-self.position_perturbation_size,
+ high=self.position_perturbation_size,
+ size=3,
+ )
+ self.set_pos(
+ name,
+ self._defaults[name]["pos"] + delta_pos,
+ )
+
+ def _randomize_direction(self, name):
+ """
+ Helper function to randomize direction of a specific light source
+
+ Args:
+ name (str): Name of the lighting source to randomize for
+ """
+ # sample a small, random axis-angle delta rotation
+ random_axis, random_angle = trans.random_axis_angle(
+ angle_limit=self.direction_perturbation_size, random_state=self.random_state
+ )
+ random_delta_rot = trans.quat2mat(trans.axisangle2quat(random_axis * random_angle))
+
+ # rotate direction by this delta rotation and set the new direction
+ new_dir = random_delta_rot.dot(self._defaults[name]["dir"])
+ self.set_dir(
+ name,
+ new_dir,
+ )
+
+ def _randomize_specular(self, name):
+ """
+ Helper function to randomize specular attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source to randomize for
+ """
+ delta = self.random_state.uniform(
+ low=-self.specular_perturbation_size,
+ high=self.specular_perturbation_size,
+ size=3,
+ )
+ self.set_specular(
+ name,
+ self._defaults[name]["specular"] + delta,
+ )
+
+ def _randomize_ambient(self, name):
+ """
+ Helper function to randomize ambient attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source to randomize for
+ """
+ delta = self.random_state.uniform(
+ low=-self.ambient_perturbation_size,
+ high=self.ambient_perturbation_size,
+ size=3,
+ )
+ self.set_ambient(
+ name,
+ self._defaults[name]["ambient"] + delta,
+ )
+
+ def _randomize_diffuse(self, name):
+ """
+ Helper function to randomize diffuse attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source to randomize for
+ """
+ delta = self.random_state.uniform(
+ low=-self.diffuse_perturbation_size,
+ high=self.diffuse_perturbation_size,
+ size=3,
+ )
+ self.set_diffuse(
+ name,
+ self._defaults[name]["diffuse"] + delta,
+ )
+
+ def _randomize_active(self, name):
+ """
+ Helper function to randomize active nature of a specific light source
+
+ Args:
+ name (str): Name of the lighting source to randomize for
+ """
+ active = int(self.random_state.uniform() > 0.5)
+ self.set_active(name, active)
+
+ def get_pos(self, name):
+ """
+ Grabs position of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+
+ Returns:
+ np.array: (x,y,z) position of lighting source
+
+ Raises:
+ AssertionError: Invalid light name
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ return self.model.light_pos[lightid]
+
+ def set_pos(self, name, value):
+ """
+ Sets position of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+ value (np.array): (x,y,z) position to set lighting source to
+
+ Raises:
+ AssertionError: Invalid light name
+ AssertionError: Invalid @value
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ value = list(value)
+ assert len(value) == 3, "Expected 3-dim value, got %s" % value
+
+ self.model.light_pos[lightid] = value
+
+ def get_dir(self, name):
+ """
+ Grabs direction of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+
+ Returns:
+ np.array: (x,y,z) direction of lighting source
+
+ Raises:
+ AssertionError: Invalid light name
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ return self.model.light_dir[lightid]
+
+ def set_dir(self, name, value):
+ """
+ Sets direction of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+ value (np.array): (ax,ay,az) direction to set lighting source to
+
+ Raises:
+ AssertionError: Invalid light name
+ AssertionError: Invalid @value
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ value = list(value)
+ assert len(value) == 3, "Expected 3-dim value, got %s" % value
+
+ self.model.light_dir[lightid] = value
+
+ def get_active(self, name):
+ """
+ Grabs active nature of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+
+ Returns:
+ int: Whether light source is active (1) or not (0)
+
+ Raises:
+ AssertionError: Invalid light name
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ return self.model.light_active[lightid]
+
+ def set_active(self, name, value):
+ """
+ Sets active nature of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+ value (int): Whether light source is active (1) or not (0)
+
+ Raises:
+ AssertionError: Invalid light name
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ self.model.light_active[lightid] = value
+
+ def get_specular(self, name):
+ """
+ Grabs specular attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+
+ Returns:
+ np.array: (r,g,b) specular color of lighting source
+
+ Raises:
+ AssertionError: Invalid light name
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ return self.model.light_specular[lightid]
+
+ def set_specular(self, name, value):
+ """
+ Sets specular attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+ value (np.array): (r,g,b) specular color to set lighting source to
+
+ Raises:
+ AssertionError: Invalid light name
+ AssertionError: Invalid @value
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ value = list(value)
+ assert len(value) == 3, "Expected 3-dim value, got %s" % value
+
+ self.model.light_specular[lightid] = value
+
+ def get_ambient(self, name):
+ """
+ Grabs ambient attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+
+ Returns:
+ np.array: (r,g,b) ambient color of lighting source
+
+ Raises:
+ AssertionError: Invalid light name
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ return self.model.light_ambient[lightid]
+
+ def set_ambient(self, name, value):
+ """
+ Sets ambient attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+ value (np.array): (r,g,b) ambient color to set lighting source to
+
+ Raises:
+ AssertionError: Invalid light name
+ AssertionError: Invalid @value
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ value = list(value)
+ assert len(value) == 3, "Expected 3-dim value, got %s" % value
+
+ self.model.light_ambient[lightid] = value
+
+ def get_diffuse(self, name):
+ """
+ Grabs diffuse attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+
+ Returns:
+ np.array: (r,g,b) diffuse color of lighting source
+
+ Raises:
+ AssertionError: Invalid light name
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ return self.model.light_diffuse[lightid]
+
+ def set_diffuse(self, name, value):
+ """
+ Sets diffuse attribute of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+ value (np.array): (r,g,b) diffuse color to set lighting source to
+
+ Raises:
+ AssertionError: Invalid light name
+ AssertionError: Invalid @value
+ """
+ lightid = self.get_lightid(name)
+        assert lightid > -1, "Unknown light %s" % name
+
+ value = list(value)
+ assert len(value) == 3, "Expected 3-dim value, got %s" % value
+
+ self.model.light_diffuse[lightid] = value
+
+ def get_lightid(self, name):
+ """
+ Grabs unique id number of a specific light source
+
+ Args:
+ name (str): Name of the lighting source
+
+ Returns:
+ int: id of lighting source. -1 if not found
+ """
+ return self.model.light_name2id(name)
+
+
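+# A minimal usage sketch for LightingModder, assuming an existing MjSim instance `sim`:
+#
+#     modder = LightingModder(sim, randomize_active=False)
+#     modder.randomize()           # perturb position / direction / colors of all lights
+#     ...                          # render or step the simulation
+#     modder.restore_defaults()    # revert to the values saved at construction time
+
+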
+class CameraModder(BaseModder):
+ """
+ Modder for modifying camera attributes in mujoco sim
+
+ Args:
+ sim (MjSim): MjSim object
+
+ random_state (None or RandomState): instance of np.random.RandomState
+
+ camera_names (None or list of str): list of camera names to use for randomization. If not provided,
+ all cameras are used for randomization.
+
+ randomize_position (bool): if True, randomize camera position
+
+ randomize_rotation (bool): if True, randomize camera rotation
+
+ randomize_fovy (bool): if True, randomize camera fovy
+
+ position_perturbation_size (float): size of camera position perturbations to each dimension
+
+ rotation_perturbation_size (float): magnitude of camera rotation perturbations in axis-angle.
+ Default corresponds to around 5 degrees.
+
+ fovy_perturbation_size (float): magnitude of camera fovy perturbations (corresponds to focusing)
+
+ Raises:
+ AssertionError: [No randomization selected]
+ """
+
+ def __init__(
+ self,
+ sim,
+ random_state=None,
+ camera_names=None,
+ randomize_position=True,
+ randomize_rotation=True,
+ randomize_fovy=True,
+ position_perturbation_size=0.01,
+ rotation_perturbation_size=0.087,
+ fovy_perturbation_size=5.0,
+ ):
+ super().__init__(sim, random_state=random_state)
+
+ assert randomize_position or randomize_rotation or randomize_fovy
+
+ if camera_names is None:
+ camera_names = self.sim.model.camera_names
+ self.camera_names = camera_names
+
+ self.randomize_position = randomize_position
+ self.randomize_rotation = randomize_rotation
+ self.randomize_fovy = randomize_fovy
+
+ self.position_perturbation_size = position_perturbation_size
+ self.rotation_perturbation_size = rotation_perturbation_size
+ self.fovy_perturbation_size = fovy_perturbation_size
+
+ self.save_defaults()
+
+ def save_defaults(self):
+ """
+ Uses the current MjSim state and model to save default parameter values.
+ """
+ self._defaults = {k: {} for k in self.camera_names}
+ for camera_name in self.camera_names:
+ self._defaults[camera_name]["pos"] = np.array(self.get_pos(camera_name))
+ self._defaults[camera_name]["quat"] = np.array(self.get_quat(camera_name))
+ self._defaults[camera_name]["fovy"] = self.get_fovy(camera_name)
+
+ def restore_defaults(self):
+ """
+ Reloads the saved parameter values.
+ """
+ for camera_name in self.camera_names:
+ self.set_pos(camera_name, self._defaults[camera_name]["pos"])
+ self.set_quat(camera_name, self._defaults[camera_name]["quat"])
+ self.set_fovy(camera_name, self._defaults[camera_name]["fovy"])
+
+ def randomize(self):
+ """
+ Randomizes all requested camera values within the sim
+ """
+ for camera_name in self.camera_names:
+ if self.randomize_position:
+ self._randomize_position(camera_name)
+
+ if self.randomize_rotation:
+ self._randomize_rotation(camera_name)
+
+ if self.randomize_fovy:
+ self._randomize_fovy(camera_name)
+
+ def _randomize_position(self, name):
+ """
+ Helper function to randomize position of a specific camera
+
+ Args:
+ name (str): Name of the camera to randomize for
+ """
+ delta_pos = self.random_state.uniform(
+ low=-self.position_perturbation_size,
+ high=self.position_perturbation_size,
+ size=3,
+ )
+ self.set_pos(
+ name,
+ self._defaults[name]["pos"] + delta_pos,
+ )
+
+ def _randomize_rotation(self, name):
+ """
+ Helper function to randomize orientation of a specific camera
+
+ Args:
+ name (str): Name of the camera to randomize for
+ """
+ # sample a small, random axis-angle delta rotation
+ random_axis, random_angle = trans.random_axis_angle(
+ angle_limit=self.rotation_perturbation_size, random_state=self.random_state
+ )
+ random_delta_rot = trans.quat2mat(trans.axisangle2quat(random_axis * random_angle))
+
+ # compute new rotation and set it
+ base_rot = trans.quat2mat(trans.convert_quat(self._defaults[name]["quat"], to="xyzw"))
+ new_rot = random_delta_rot.T.dot(base_rot)
+ new_quat = trans.convert_quat(trans.mat2quat(new_rot), to="wxyz")
+ self.set_quat(
+ name,
+ new_quat,
+ )
+
+ def _randomize_fovy(self, name):
+ """
+ Helper function to randomize fovy of a specific camera
+
+ Args:
+ name (str): Name of the camera to randomize for
+ """
+ delta_fovy = self.random_state.uniform(
+ low=-self.fovy_perturbation_size,
+ high=self.fovy_perturbation_size,
+ )
+ self.set_fovy(
+ name,
+ self._defaults[name]["fovy"] + delta_fovy,
+ )
+
+ def get_fovy(self, name):
+ """
+ Grabs fovy of a specific camera
+
+ Args:
+ name (str): Name of the camera
+
+ Returns:
+ float: vertical field of view of the camera, expressed in degrees
+
+ Raises:
+ AssertionError: Invalid camera name
+ """
+ camid = self.get_camid(name)
+ assert camid > -1, "Unknown camera %s" % name
+ return self.model.cam_fovy[camid]
+
+ def set_fovy(self, name, value):
+ """
+ Sets fovy of a specific camera
+
+ Args:
+ name (str): Name of the camera
+ value (float): vertical field of view of the camera, expressed in degrees
+
+ Raises:
+ AssertionError: Invalid camera name
+ AssertionError: Invalid value
+ """
+ camid = self.get_camid(name)
+ assert 0 < value < 180
+ assert camid > -1, "Unknown camera %s" % name
+ self.model.cam_fovy[camid] = value
+
+ def get_quat(self, name):
+ """
+ Grabs orientation of a specific camera
+
+ Args:
+ name (str): Name of the camera
+
+ Returns:
+ np.array: (w,x,y,z) orientation of the camera, expressed in quaternions
+
+ Raises:
+ AssertionError: Invalid camera name
+ """
+ camid = self.get_camid(name)
+ assert camid > -1, "Unknown camera %s" % name
+ return self.model.cam_quat[camid]
+
+ def set_quat(self, name, value):
+ """
+ Sets orientation of a specific camera
+
+ Args:
+ name (str): Name of the camera
+ value (np.array): (w,x,y,z) orientation of the camera, expressed in quaternions
+
+ Raises:
+ AssertionError: Invalid camera name
+ AssertionError: Invalid value
+ """
+ value = list(value)
+ assert len(value) == 4, "Expectd value of length 4, instead got %s" % value
+ camid = self.get_camid(name)
+ assert camid > -1, "Unknown camera %s" % name
+ self.model.cam_quat[camid] = value
+
+ def get_pos(self, name):
+ """
+ Grabs position of a specific camera
+
+ Args:
+ name (str): Name of the camera
+
+ Returns:
+ np.array: (x,y,z) position of the camera
+
+ Raises:
+ AssertionError: Invalid camera name
+ """
+ camid = self.get_camid(name)
+ assert camid > -1, "Unknown camera %s" % name
+ return self.model.cam_pos[camid]
+
+ def set_pos(self, name, value):
+ """
+ Sets position of a specific camera
+
+ Args:
+ name (str): Name of the camera
+ value (np.array): (x,y,z) position of the camera
+
+ Raises:
+ AssertionError: Invalid camera name
+ AssertionError: Invalid value
+ """
+ value = list(value)
+ assert len(value) == 3, "Expected value of length 3, instead got %s" % value
+ camid = self.get_camid(name)
+        assert camid > -1, "Unknown camera %s" % name
+ self.model.cam_pos[camid] = value
+
+ def get_camid(self, name):
+ """
+ Grabs unique id number of a specific camera
+
+ Args:
+ name (str): Name of the camera
+
+ Returns:
+ int: id of camera. -1 if not found
+ """
+ return self.model.camera_name2id(name)
+
+
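+# A minimal usage sketch for CameraModder, assuming `sim` contains a camera named "agentview"
+# (an assumed camera name, not guaranteed to exist in every model):
+#
+#     cam_modder = CameraModder(sim, camera_names=["agentview"], position_perturbation_size=0.02)
+#     cam_modder.randomize()                  # jitter pose and fovy of the listed cameras
+#     pos = cam_modder.get_pos("agentview")   # current (x, y, z) position of the camera
+#     cam_modder.restore_defaults()
+
+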
+class TextureModder(BaseModder):
+ """
+ Modify textures in model. Example use:
+ sim = MjSim(...)
+ modder = TextureModder(sim)
+ modder.whiten_materials() # ensures materials won't impact colors
+ modder.set_checker('some_geom', (255, 0, 0), (0, 0, 0))
+ modder.rand_all('another_geom')
+
+ Note: in order for the textures to take full effect, you'll need to set
+ the rgba values for all materials to [1, 1, 1, 1], otherwise the texture
+ colors will be modulated by the material colors. Call the
+ `whiten_materials` helper method to set all material colors to white.
+
+ Args:
+ sim (MjSim): MjSim object
+
+ random_state (RandomState): instance of np.random.RandomState
+
+        geom_names (None or list of str): list of geom names to use for randomization. If not provided,
+ all geoms are used for randomization.
+
+ randomize_local (bool): if True, constrain RGB color variations to be close to the
+ original RGB colors per geom and texture. Otherwise, RGB color values will
+ be sampled uniformly at random.
+
+ randomize_material (bool): if True, randomizes material properties associated with a
+ given texture (reflectance, shininess, specular)
+
+ local_rgb_interpolation (float): determines the size of color variations from
+ the base geom colors when @randomize_local is True.
+
+ local_material_interpolation (float): determines the size of material variations from
+ the base material when @randomize_local and @randomize_material are both True.
+
+ texture_variations (list of str): a list of texture variation strings. Each string
+ must be either 'rgb', 'checker', 'noise', or 'gradient' and corresponds to
+ a specific kind of texture randomization. For each geom that has a material
+ and texture, a random variation from this list is sampled and applied.
+
+ randomize_skybox (bool): if True, apply texture variations to the skybox as well.
+ """
+
+ def __init__(
+ self,
+ sim,
+ random_state=None,
+ geom_names=None,
+ randomize_local=False,
+ randomize_material=False,
+ local_rgb_interpolation=0.1,
+ local_material_interpolation=0.2,
+ texture_variations=("rgb", "checker", "noise", "gradient"),
+ randomize_skybox=True,
+ ):
+ super().__init__(sim, random_state=random_state)
+
+ if geom_names is None:
+ geom_names = self.sim.model.geom_names
+ self.geom_names = geom_names
+
+ self.randomize_local = randomize_local
+ self.randomize_material = randomize_material
+ self.local_rgb_interpolation = local_rgb_interpolation
+ self.local_material_interpolation = local_material_interpolation
+ self.texture_variations = list(texture_variations)
+ self.randomize_skybox = randomize_skybox
+
+ self._all_texture_variation_callbacks = {
+ "rgb": self.rand_rgb,
+ "checker": self.rand_checker,
+ "noise": self.rand_noise,
+ "gradient": self.rand_gradient,
+ }
+ self._texture_variation_callbacks = {
+ k: self._all_texture_variation_callbacks[k] for k in self.texture_variations
+ }
+
+ self.save_defaults()
+
+ def save_defaults(self):
+ """
+ Uses the current MjSim state and model to save default parameter values.
+ """
+ self.textures = [Texture(self.model, i) for i in range(self.model.ntex)]
+ # self._build_tex_geom_map()
+
+ # save copy of original texture bitmaps
+ self._default_texture_bitmaps = [np.array(text.bitmap) for text in self.textures]
+
+ # These matrices will be used to rapidly synthesize
+ # checker pattern bitmaps
+ self._cache_checker_matrices()
+
+ self._defaults = {k: {} for k in self.geom_names}
+ if self.randomize_skybox:
+ self._defaults["skybox"] = {}
+ for name in self.geom_names:
+ if self._check_geom_for_texture(name):
+ # store the texture bitmap for this geom
+ tex_id = self._name_to_tex_id(name)
+ self._defaults[name]["texture"] = self._default_texture_bitmaps[tex_id]
+ # store material properties as well (in tuple (reflectance, shininess, specular) form)
+ self._defaults[name]["material"] = self.get_material(name)
+ else:
+ # store geom color
+ self._defaults[name]["rgb"] = np.array(self.get_geom_rgb(name))
+
+ if self.randomize_skybox:
+ tex_id = self._name_to_tex_id("skybox")
+ self._defaults["skybox"]["texture"] = self._default_texture_bitmaps[tex_id]
+
+ def restore_defaults(self):
+ """
+ Reloads the saved parameter values.
+ """
+ for name in self.geom_names:
+ if self._check_geom_for_texture(name):
+ self.set_texture(name, self._defaults[name]["texture"], perturb=False)
+ self.set_material(name, self._defaults[name]["material"], perturb=False)
+ else:
+ self.set_geom_rgb(name, self._defaults[name]["rgb"])
+
+ if self.randomize_skybox:
+ self.set_texture("skybox", self._defaults["skybox"]["texture"], perturb=False)
+
+ def randomize(self):
+ """
+ Overrides mujoco-py implementation to also randomize color
+ for geoms that have no material.
+ """
+ self.whiten_materials()
+ for name in self.geom_names:
+ if self._check_geom_for_texture(name):
+ # geom has valid texture that can be randomized
+ self._randomize_texture(name)
+ # randomize material if requested
+ if self.randomize_material:
+ self._randomize_material(name)
+ else:
+ # randomize geom color
+ self._randomize_geom_color(name)
+
+ if self.randomize_skybox:
+ self._randomize_texture("skybox")
+
+ def _randomize_geom_color(self, name):
+ """
+ Helper function to randomize color of a specific geom
+
+ Args:
+ name (str): Name of the geom to randomize for
+ """
+ if self.randomize_local:
+ random_color = self.random_state.uniform(0, 1, size=3)
+ rgb = (1.0 - self.local_rgb_interpolation) * self._defaults[name][
+ "rgb"
+ ] + self.local_rgb_interpolation * random_color
+ else:
+ rgb = self.random_state.uniform(0, 1, size=3)
+ self.set_geom_rgb(name, rgb)
+
+ def _randomize_texture(self, name):
+ """
+ Helper function to randomize texture of a specific geom
+
+ Args:
+ name (str): Name of the geom to randomize for
+ """
+ keys = list(self._texture_variation_callbacks.keys())
+ choice = keys[self.random_state.randint(len(keys))]
+ self._texture_variation_callbacks[choice](name)
+
+ def _randomize_material(self, name):
+ """
+ Helper function to randomize material of a specific geom
+
+ Args:
+ name (str): Name of the geom to randomize for
+ """
+ # Return immediately if this is the skybox
+ if name == "skybox":
+ return
+ # Grab material id
+ mat_id = self._name_to_mat_id(name)
+ # Randomize reflectance, shininess, and specular
+ material = self.random_state.uniform(0, 1, size=3) # (reflectance, shininess, specular)
+ self.set_material(name, material, perturb=self.randomize_local)
+
+ def rand_checker(self, name):
+ """
+ Generates a random checker pattern for a specific geom
+
+ Args:
+ name (str): Name of the geom to randomize for
+ """
+ rgb1, rgb2 = self.get_rand_rgb(2)
+ self.set_checker(name, rgb1, rgb2, perturb=self.randomize_local)
+
+ def rand_gradient(self, name):
+ """
+ Generates a random gradient pattern for a specific geom
+
+ Args:
+ name (str): Name of the geom to randomize for
+ """
+ rgb1, rgb2 = self.get_rand_rgb(2)
+ vertical = bool(self.random_state.uniform() > 0.5)
+ self.set_gradient(name, rgb1, rgb2, vertical=vertical, perturb=self.randomize_local)
+
+ def rand_rgb(self, name):
+ """
+ Generates a random RGB color for a specific geom
+
+ Args:
+ name (str): Name of the geom to randomize for
+ """
+ rgb = self.get_rand_rgb()
+ self.set_rgb(name, rgb, perturb=self.randomize_local)
+
+ def rand_noise(self, name):
+ """
+ Generates a random RGB noise pattern for a specific geom
+
+ Args:
+ name (str): Name of the geom to randomize for
+ """
+ fraction = 0.1 + self.random_state.uniform() * 0.8
+ rgb1, rgb2 = self.get_rand_rgb(2)
+ self.set_noise(name, rgb1, rgb2, fraction, perturb=self.randomize_local)
+
+ def whiten_materials(self):
+ """
+ Extends modder.TextureModder to also whiten geom_rgba
+
+ Helper method for setting all material colors to white, otherwise
+ the texture modifications won't take full effect.
+ """
+ for name in self.geom_names:
+ # whiten geom
+ geom_id = self.model.geom_name2id(name)
+ self.model.geom_rgba[geom_id, :] = 1.0
+
+ if self._check_geom_for_texture(name):
+ # whiten material
+ mat_id = self.model.geom_matid[geom_id]
+ self.model.mat_rgba[mat_id, :] = 1.0
+
+ def get_geom_rgb(self, name):
+ """
+ Grabs rgb color of a specific geom
+
+ Args:
+ name (str): Name of the geom
+
+ Returns:
+ np.array: (r,g,b) geom colors
+ """
+ geom_id = self.model.geom_name2id(name)
+ return self.model.geom_rgba[geom_id, :3]
+
+ def set_geom_rgb(self, name, rgb):
+ """
+ Sets rgb color of a specific geom
+
+ Args:
+ name (str): Name of the geom
+ rgb (np.array): (r,g,b) geom colors
+ """
+ geom_id = self.model.geom_name2id(name)
+ self.model.geom_rgba[geom_id, :3] = rgb
+
+ def get_rand_rgb(self, n=1):
+ """
+ Grabs a batch of random rgb tuple combos
+
+ Args:
+ n (int): How many sets of rgb tuples to randomly generate
+
+ Returns:
+ np.array or n-tuple: if n > 1, each tuple entry is a rgb tuple. else, single (r,g,b) array
+ """
+
+ def _rand_rgb():
+ return np.array(self.random_state.uniform(size=3) * 255, dtype=np.uint8)
+
+ if n == 1:
+ return _rand_rgb()
+ else:
+ return tuple(_rand_rgb() for _ in range(n))
+
+ def get_texture(self, name):
+ """
+ Grabs texture of a specific geom
+
+ Args:
+ name (str): Name of the geom
+
+ Returns:
+ Texture: texture associated with the geom
+ """
+ tex_id = self._name_to_tex_id(name)
+ texture = self.textures[tex_id]
+ return texture
+
+ def set_texture(self, name, bitmap, perturb=False):
+ """
+ Sets the bitmap for the texture that corresponds
+ to geom @name.
+
+ If @perturb is True, then use the computed bitmap
+ to perturb the default bitmap slightly, instead
+ of replacing it.
+
+ Args:
+ name (str): Name of the geom
+ bitmap (np.array): 3d-array representing rgb pixel-wise values
+ perturb (bool): Whether to perturb the inputted bitmap or not
+ """
+ bitmap_to_set = self.get_texture(name).bitmap
+ if perturb:
+ bitmap = (1.0 - self.local_rgb_interpolation) * self._defaults[name][
+ "texture"
+ ] + self.local_rgb_interpolation * bitmap
+ bitmap_to_set[:] = bitmap
+ self.upload_texture(name)
+
+ def get_material(self, name):
+ """
+ Grabs material of a specific geom
+
+ Args:
+ name (str): Name of the geom
+
+ Returns:
+ np.array: (reflectance, shininess, specular) material properties associated with the geom
+ """
+ mat_id = self._name_to_mat_id(name)
+ # Material is in tuple form (reflectance, shininess, specular)
+ material = np.array(
+ (self.model.mat_reflectance[mat_id], self.model.mat_shininess[mat_id], self.model.mat_specular[mat_id])
+ )
+ return material
+
+ def set_material(self, name, material, perturb=False):
+ """
+ Sets the material that corresponds to geom @name.
+
+ If @perturb is True, then use the computed material
+ to perturb the default material slightly, instead
+ of replacing it.
+
+ Args:
+ name (str): Name of the geom
+ material (np.array): (reflectance, shininess, specular) material properties associated with the geom
+ perturb (bool): Whether to perturb the inputted material properties or not
+ """
+ mat_id = self._name_to_mat_id(name)
+ if perturb:
+ material = (1.0 - self.local_material_interpolation) * self._defaults[name][
+ "material"
+ ] + self.local_material_interpolation * material
+ self.model.mat_reflectance[mat_id] = material[0]
+ self.model.mat_shininess[mat_id] = material[1]
+ self.model.mat_specular[mat_id] = material[2]
+
+ def get_checker_matrices(self, name):
+ """
+ Grabs checker pattern matrix associated with @name.
+
+ Args:
+ name (str): Name of geom
+
+ Returns:
+ np.array: 3d-array representing rgb checker pattern
+ """
+ tex_id = self._name_to_tex_id(name)
+ return self._texture_checker_mats[tex_id]
+
+ def set_checker(self, name, rgb1, rgb2, perturb=False):
+ """
+ Use the two checker matrices to create a checker
+ pattern from the two colors, and set it as
+ the texture for geom @name.
+
+ Args:
+ name (str): Name of geom
+ rgb1 (3-array): (r,g,b) value for one half of checker pattern
+ rgb2 (3-array): (r,g,b) value for other half of checker pattern
+ perturb (bool): Whether to perturb the resulting checker pattern or not
+ """
+ cbd1, cbd2 = self.get_checker_matrices(name)
+ rgb1 = np.asarray(rgb1).reshape([1, 1, -1])
+ rgb2 = np.asarray(rgb2).reshape([1, 1, -1])
+ bitmap = rgb1 * cbd1 + rgb2 * cbd2
+
+ self.set_texture(name, bitmap, perturb=perturb)
+
+ def set_gradient(self, name, rgb1, rgb2, vertical=True, perturb=False):
+ """
+ Creates a linear gradient from rgb1 to rgb2.
+
+ Args:
+ name (str): Name of geom
+ rgb1 (3-array): start color
+            rgb2 (3-array): end color
+            vertical (bool): if True, the gradient runs in the positive
+                y-direction; if False, it runs in the positive x-direction.
+ perturb (bool): Whether to perturb the resulting gradient pattern or not
+ """
+ # NOTE: MuJoCo's gradient uses a sigmoid. Here we simplify
+ # and just use a linear gradient... We could change this
+ # to just use a tanh-sigmoid if needed.
+ bitmap = self.get_texture(name).bitmap
+ h, w = bitmap.shape[:2]
+ if vertical:
+ p = np.tile(np.linspace(0, 1, h)[:, None], (1, w))
+ else:
+ p = np.tile(np.linspace(0, 1, w), (h, 1))
+
+ new_bitmap = np.zeros_like(bitmap)
+ for i in range(3):
+ new_bitmap[..., i] = rgb2[i] * p + rgb1[i] * (1.0 - p)
+
+ self.set_texture(name, new_bitmap, perturb=perturb)
+
+ def set_rgb(self, name, rgb, perturb=False):
+ """
+ Just set the texture bitmap for geom @name
+ to a constant rgb value.
+
+ Args:
+ name (str): Name of geom
+ rgb (3-array): desired (r,g,b) color
+ perturb (bool): Whether to perturb the resulting color pattern or not
+ """
+ bitmap = self.get_texture(name).bitmap
+ new_bitmap = np.zeros_like(bitmap)
+ new_bitmap[..., :] = np.asarray(rgb)
+
+ self.set_texture(name, new_bitmap, perturb=perturb)
+
+ def set_noise(self, name, rgb1, rgb2, fraction=0.9, perturb=False):
+ """
+ Sets the texture bitmap for geom @name to a noise pattern
+
+ Args:
+ name (str): name of geom
+ rgb1 (3-array): background color
+ rgb2 (3-array): color of random noise foreground color
+ fraction (float): fraction of pixels with foreground color
+ perturb (bool): Whether to perturb the resulting color pattern or not
+ """
+ bitmap = self.get_texture(name).bitmap
+ h, w = bitmap.shape[:2]
+ mask = self.random_state.uniform(size=(h, w)) < fraction
+
+ new_bitmap = np.zeros_like(bitmap)
+ new_bitmap[..., :] = np.asarray(rgb1)
+ new_bitmap[mask, :] = np.asarray(rgb2)
+
+ self.set_texture(name, new_bitmap, perturb=perturb)
+
+ def upload_texture(self, name, device_id=0):
+ """
+ Uploads the texture to the GPU so it's available in the rendering.
+
+ Args:
+            name (str): name of geom
+            device_id (int): id of the device to use if an offscreen render context must be created
+ """
+ texture = self.get_texture(name)
+ if self.sim._render_context_offscreen is None:
+ render_context = MjRenderContextOffscreen(self.sim, device_id)
+ render_context.upload_texture(texture.id)
+
+ def _check_geom_for_texture(self, name):
+ """
+        Helper function to determine whether the geom @name has
+        an assigned material and whether that material has
+        an assigned texture.
+
+ Args:
+ name (str): name of geom
+
+ Returns:
+ bool: True if specific geom has both material and texture associated, else False
+ """
+ geom_id = self.model.geom_name2id(name)
+ mat_id = self.model.geom_matid[geom_id]
+ if mat_id < 0:
+ return False
+ tex_id = self.model.mat_texid[mat_id]
+ if tex_id < 0:
+ return False
+ return True
+
+ def _name_to_tex_id(self, name):
+ """
+ Helper function to get texture id from geom name.
+
+ Args:
+ name (str): name of geom
+
+ Returns:
+ int: id of texture associated with geom
+
+ Raises:
+ AssertionError: [No texture associated with geom]
+ """
+
+ # handle skybox separately
+ if name == "skybox":
+ skybox_tex_id = -1
+ for tex_id in range(self.model.ntex):
+ skybox_textype = 2
+ if self.model.tex_type[tex_id] == skybox_textype:
+ skybox_tex_id = tex_id
+ assert skybox_tex_id >= 0
+ return skybox_tex_id
+
+ assert self._check_geom_for_texture(name)
+ geom_id = self.model.geom_name2id(name)
+ mat_id = self.model.geom_matid[geom_id]
+ tex_id = self.model.mat_texid[mat_id]
+ return tex_id
+
+ def _name_to_mat_id(self, name):
+ """
+ Helper function to get material id from geom name.
+
+ Args:
+ name (str): name of geom
+
+ Returns:
+ int: id of material associated with geom
+
+ Raises:
+ ValueError: [No material associated with skybox]
+ AssertionError: [No material associated with geom]
+ """
+
+ # handle skybox separately
+ if name == "skybox":
+ raise ValueError("Error: skybox has no material!")
+
+ assert self._check_geom_for_texture(name)
+ geom_id = self.model.geom_name2id(name)
+ mat_id = self.model.geom_matid[geom_id]
+ return mat_id
+
+ def _cache_checker_matrices(self):
+ """
+ Cache two matrices of the form [[1, 0, 1, ...],
+ [0, 1, 0, ...],
+ ...]
+ and [[0, 1, 0, ...],
+ [1, 0, 1, ...],
+ ...]
+ for each texture. To use for fast creation of checkerboard patterns
+ """
+ self._texture_checker_mats = []
+ for tex_id in range(self.model.ntex):
+ texture = self.textures[tex_id]
+ h, w = texture.bitmap.shape[:2]
+ self._texture_checker_mats.append(self._make_checker_matrices(h, w))
+
+ def _make_checker_matrices(self, h, w):
+ """
+ Helper function to quickly generate binary matrices used to create checker patterns
+
+ Args:
+ h (int): Desired height of matrices
+ w (int): Desired width of matrices
+
+ Returns:
+ 2-tuple:
+
+ - (np.array): 2d-array representing first half of checker matrix
+ - (np.array): 2d-array representing second half of checker matrix
+ """
+ re = np.r_[((w + 1) // 2) * [0, 1]]
+ ro = np.r_[((w + 1) // 2) * [1, 0]]
+ cbd1 = np.expand_dims(np.row_stack(((h + 1) // 2) * [re, ro]), -1)[:h, :w]
+ cbd2 = np.expand_dims(np.row_stack(((h + 1) // 2) * [ro, re]), -1)[:h, :w]
+ return cbd1, cbd2
+
+
+# From mjtTexture
+MJT_TEXTURE_ENUM = ["2d", "cube", "skybox"]
+
+
+class Texture:
+ """
+ Helper class for operating on the MuJoCo textures.
+
+ Args:
+ model (MjModel): Mujoco sim model
+ tex_id (int): id of specific texture in mujoco sim
+ """
+
+ __slots__ = ["id", "type", "height", "width", "tex_adr", "tex_rgb"]
+
+ def __init__(self, model, tex_id):
+ self.id = tex_id
+ self.type = MJT_TEXTURE_ENUM[model.tex_type[tex_id]]
+ self.height = model.tex_height[tex_id]
+ self.width = model.tex_width[tex_id]
+ self.tex_adr = model.tex_adr[tex_id]
+ self.tex_rgb = model.tex_rgb
+
+ @property
+ def bitmap(self):
+ """
+ Grabs color bitmap associated with this texture from the mujoco sim.
+
+ Returns:
+ np.array: 3d-array representing the rgb texture bitmap
+ """
+ size = self.height * self.width * 3
+ data = self.tex_rgb[self.tex_adr : self.tex_adr + size]
+ return data.reshape((self.height, self.width, 3))
+
+
+class DynamicsModder(BaseModder):
+ """
+    Modder for various dynamics properties of the mujoco model, such as friction, damping, etc.
+    This can be used to modify parameters stored in MjModel (i.e. friction, damping, etc.) as
+    well as simulation option parameters stored in MjModel.opt (i.e. medium density, viscosity, etc.).
+    To modify a parameter, call self.mod with the name of the element to change, the attribute
+    to be changed, and the new value for that attribute. Supports arbitrarily many
+    modifications in a single step. Example use:
+ sim = MjSim(...)
+ modder = DynamicsModder(sim)
+ modder.mod("element1_name", "attr1", new_value1)
+ modder.mod("element2_name", "attr2", new_value2)
+ ...
+ modder.update()
+
+ NOTE: It is necessary to perform modder.update() after performing all modifications to make sure
+ the changes are propagated
+
+ NOTE: A full list of supported randomizable parameters can be seen by calling modder.dynamics_parameters
+
+ NOTE: When modifying parameters belonging to MjModel.opt (e.g.: density, viscosity), no name should
+ be specified (set it as None in mod(...)). This is because opt does not have a name attribute
+ associated with it
+
+ Args:
+ sim (MjSim): Mujoco sim instance
+
+ random_state (RandomState): instance of np.random.RandomState
+
+ randomize_density (bool): If True, randomizes global medium density
+
+ randomize_viscosity (bool): If True, randomizes global medium viscosity
+
+ density_perturbation_ratio (float): Relative (fraction) magnitude of default density randomization
+
+        viscosity_perturbation_ratio (float): Relative (fraction) magnitude of default viscosity randomization
+
+ body_names (None or list of str): list of bodies to use for randomization. If not provided, all
+ bodies in the model are randomized.
+
+ randomize_position (bool): If True, randomizes body positions
+
+ randomize_quaternion (bool): If True, randomizes body quaternions
+
+ randomize_inertia (bool): If True, randomizes body inertias (only applicable for non-zero mass bodies)
+
+ randomize_mass (bool): If True, randomizes body masses (only applicable for non-zero mass bodies)
+
+ position_perturbation_size (float): Magnitude of body position randomization
+
+ quaternion_perturbation_size (float): Magnitude of body quaternion randomization (angle in radians)
+
+ inertia_perturbation_ratio (float): Relative (fraction) magnitude of body inertia randomization
+
+ mass_perturbation_ratio (float): Relative (fraction) magnitude of body mass randomization
+
+ geom_names (None or list of str): list of geoms to use for randomization. If not provided, all
+ geoms in the model are randomized.
+
+ randomize_friction (bool): If True, randomizes geom frictions
+
+ randomize_solref (bool): If True, randomizes geom solrefs
+
+ randomize_solimp (bool): If True, randomizes geom solimps
+
+ friction_perturbation_ratio (float): Relative (fraction) magnitude of geom friction randomization
+
+ solref_perturbation_ratio (float): Relative (fraction) magnitude of geom solref randomization
+
+ solimp_perturbation_ratio (float): Relative (fraction) magnitude of geom solimp randomization
+
+ joint_names (None or list of str): list of joints to use for randomization. If not provided, all
+ joints in the model are randomized.
+
+ randomize_stiffness (bool): If True, randomizes joint stiffnesses
+
+ randomize_frictionloss (bool): If True, randomizes joint frictionlosses
+
+ randomize_damping (bool): If True, randomizes joint dampings
+
+ randomize_armature (bool): If True, randomizes joint armatures
+
+ stiffness_perturbation_ratio (float): Relative (fraction) magnitude of joint stiffness randomization
+
+ frictionloss_perturbation_size (float): Magnitude of joint frictionloss randomization
+
+ damping_perturbation_size (float): Magnitude of joint damping randomization
+
+ armature_perturbation_size (float): Magnitude of joint armature randomization
+ """
+
+ def __init__(
+ self,
+ sim,
+ random_state=None,
+ # Opt parameters
+ randomize_density=True,
+ randomize_viscosity=True,
+ density_perturbation_ratio=0.1,
+ viscosity_perturbation_ratio=0.1,
+ # Body parameters
+ body_names=None,
+ randomize_position=True,
+ randomize_quaternion=True,
+ randomize_inertia=True,
+ randomize_mass=True,
+ position_perturbation_size=0.02,
+ quaternion_perturbation_size=0.02,
+ inertia_perturbation_ratio=0.02,
+ mass_perturbation_ratio=0.02,
+ # Geom parameters
+ geom_names=None,
+ randomize_friction=True,
+ randomize_solref=True,
+ randomize_solimp=True,
+ friction_perturbation_ratio=0.1,
+ solref_perturbation_ratio=0.1,
+ solimp_perturbation_ratio=0.1,
+ # Joint parameters
+ joint_names=None,
+ randomize_stiffness=True,
+ randomize_frictionloss=True,
+ randomize_damping=True,
+ randomize_armature=True,
+ stiffness_perturbation_ratio=0.1,
+ frictionloss_perturbation_size=0.05,
+ damping_perturbation_size=0.01,
+ armature_perturbation_size=0.01,
+ ):
+ super().__init__(sim=sim, random_state=random_state)
+
+ # Setup relevant values
+ self.dummy_bodies = set()
+ # Find all bodies that don't have any mass associated with them
+ for body_name in self.sim.model.body_names:
+ body_id = self.sim.model.body_name2id(body_name)
+ if self.sim.model.body_mass[body_id] == 0:
+ self.dummy_bodies.add(body_name)
+
+ # Get all values to randomize
+ self.body_names = list(self.sim.model.body_names) if body_names is None else body_names
+ self.geom_names = list(self.sim.model.geom_names) if geom_names is None else geom_names
+ self.joint_names = list(self.sim.model.joint_names) if joint_names is None else joint_names
+
+ # Setup randomization settings
+ # Each dynamics randomization group has its set of randomizable parameters, each of which has
+        # its own settings ["randomize": whether it's actively being randomized, "perturbation": the (potentially)
+ # relative magnitude of the randomization to use, "type": either "ratio" or "size" (relative or absolute
+ # perturbations), and "clip": (low, high) values to clip the final perturbed value by]
+ self.opt_randomizations = {
+ "density": {
+ "randomize": randomize_density,
+ "perturbation": density_perturbation_ratio,
+ "type": "ratio",
+ "clip": (0.0, np.inf),
+ },
+ "viscosity": {
+ "randomize": randomize_viscosity,
+ "perturbation": viscosity_perturbation_ratio,
+ "type": "ratio",
+ "clip": (0.0, np.inf),
+ },
+ }
+
+ self.body_randomizations = {
+ "position": {
+ "randomize": randomize_position,
+ "perturbation": position_perturbation_size,
+ "type": "size",
+ "clip": (-np.inf, np.inf),
+ },
+ "quaternion": {
+ "randomize": randomize_quaternion,
+ "perturbation": quaternion_perturbation_size,
+ "type": "size",
+ "clip": (-np.inf, np.inf),
+ },
+ "inertia": {
+ "randomize": randomize_inertia,
+ "perturbation": inertia_perturbation_ratio,
+ "type": "ratio",
+ "clip": (0.0, np.inf),
+ },
+ "mass": {
+ "randomize": randomize_mass,
+ "perturbation": mass_perturbation_ratio,
+ "type": "ratio",
+ "clip": (0.0, np.inf),
+ },
+ }
+
+ self.geom_randomizations = {
+ "friction": {
+ "randomize": randomize_friction,
+ "perturbation": friction_perturbation_ratio,
+ "type": "ratio",
+ "clip": (0.0, np.inf),
+ },
+ "solref": {
+ "randomize": randomize_solref,
+ "perturbation": solref_perturbation_ratio,
+ "type": "ratio",
+ "clip": (0.0, 1.0),
+ },
+ "solimp": {
+ "randomize": randomize_solimp,
+ "perturbation": solimp_perturbation_ratio,
+ "type": "ratio",
+ "clip": (0.0, np.inf),
+ },
+ }
+
+ self.joint_randomizations = {
+ "stiffness": {
+ "randomize": randomize_stiffness,
+ "perturbation": stiffness_perturbation_ratio,
+ "type": "ratio",
+ "clip": (0.0, np.inf),
+ },
+ "frictionloss": {
+ "randomize": randomize_frictionloss,
+ "perturbation": frictionloss_perturbation_size,
+ "type": "size",
+ "clip": (0.0, np.inf),
+ },
+ "damping": {
+ "randomize": randomize_damping,
+ "perturbation": damping_perturbation_size,
+ "type": "size",
+ "clip": (0.0, np.inf),
+ },
+ "armature": {
+ "randomize": randomize_armature,
+ "perturbation": armature_perturbation_size,
+ "type": "size",
+ "clip": (0.0, np.inf),
+ },
+ }
+
+        # Store defaults so we don't lose track of the original (non-perturbed) values
+ self.opt_defaults = None
+ self.body_defaults = None
+ self.geom_defaults = None
+ self.joint_defaults = None
+ self.save_defaults()
+
+ def save_defaults(self):
+ """
+ Grabs the current values for all parameters in sim and stores them as default values
+ """
+ self.opt_defaults = {
+ None: { # no name associated with the opt parameters
+ "density": self.sim.model.opt.density,
+ "viscosity": self.sim.model.opt.viscosity,
+ }
+ }
+
+ self.body_defaults = {}
+ for body_name in self.sim.model.body_names:
+ body_id = self.sim.model.body_name2id(body_name)
+ self.body_defaults[body_name] = {
+ "position": np.array(self.sim.model.body_pos[body_id]),
+ "quaternion": np.array(self.sim.model.body_quat[body_id]),
+ "inertia": np.array(self.sim.model.body_inertia[body_id]),
+ "mass": self.sim.model.body_mass[body_id],
+ }
+
+ self.geom_defaults = {}
+ for geom_name in self.sim.model.geom_names:
+ geom_id = self.sim.model.geom_name2id(geom_name)
+ self.geom_defaults[geom_name] = {
+ "friction": np.array(self.sim.model.geom_friction[geom_id]),
+ "solref": np.array(self.sim.model.geom_solref[geom_id]),
+ "solimp": np.array(self.sim.model.geom_solimp[geom_id]),
+ }
+
+ self.joint_defaults = {}
+ for joint_name in self.sim.model.joint_names:
+ joint_id = self.sim.model.joint_name2id(joint_name)
+ dof_idx = [i for i, v in enumerate(self.sim.model.dof_jntid) if v == joint_id]
+ self.joint_defaults[joint_name] = {
+ "stiffness": self.sim.model.jnt_stiffness[joint_id],
+ "frictionloss": np.array(self.sim.model.dof_frictionloss[dof_idx]),
+ "damping": np.array(self.sim.model.dof_damping[dof_idx]),
+ "armature": np.array(self.sim.model.dof_armature[dof_idx]),
+ }
+
+ def restore_defaults(self):
+ """
+        Restores the default values currently saved in this modder
+ """
+ # Loop through all defaults and set the default value in sim
+ for group_defaults in (self.opt_defaults, self.body_defaults, self.geom_defaults, self.joint_defaults):
+ for name, defaults in group_defaults.items():
+ for attr, default_val in defaults.items():
+ self.mod(name=name, attr=attr, val=default_val)
+
+ # Make sure changes propagate in sim
+ self.update()
+
+ def randomize(self):
+ """
+ Randomizes all enabled dynamics parameters in the simulation
+ """
+ for group_defaults, group_randomizations, group_randomize_names in zip(
+ (self.opt_defaults, self.body_defaults, self.geom_defaults, self.joint_defaults),
+ (self.opt_randomizations, self.body_randomizations, self.geom_randomizations, self.joint_randomizations),
+ ([None], self.body_names, self.geom_names, self.joint_names),
+ ):
+ for name in group_randomize_names:
+ # Randomize all parameters associated with this element
+ for attr, default_val in group_defaults[name].items():
+ val = copy.copy(default_val)
+ settings = group_randomizations[attr]
+ if settings["randomize"]:
+ # Randomize accordingly, and clip the final perturbed value
+ perturbation = np.random.rand() if type(val) in {int, float} else np.random.rand(*val.shape)
+ perturbation = settings["perturbation"] * (-1 + 2 * perturbation)
+ val = val + perturbation if settings["type"] == "size" else val * (1.0 + perturbation)
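+                        # e.g., with type "ratio" and perturbation 0.1, a default value of 2.0 is rescaled to a
+                        # value uniformly sampled in [1.8, 2.2]; with type "size" the sampled offset itself lies
+                        # in [-perturbation, +perturbation] and is added to the default value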
+ val = np.clip(val, *settings["clip"])
+ # Modify this value
+ self.mod(name=name, attr=attr, val=val)
+
+ # Make sure changes propagate in sim
+ self.update()
+
+ def update_sim(self, sim):
+ """
+ In addition to super method, update internal default values to match the current values from
+ (the presumably new) @sim.
+
+ Args:
+ sim (MjSim): MjSim object
+ """
+ super().update_sim(sim=sim)
+ self.save_defaults()
+
+ def update(self):
+ """
+ Propagates the changes made up to this point through the simulation
+ """
+ self.sim.forward()
+
+ def mod(self, name, attr, val):
+ """
+ General method to modify dynamics parameter @attr to be new value @val, associated with element @name.
+
+ Args:
+ name (str): Name of element to modify parameter. This can be a body, geom, or joint name. If modifying
+ an opt parameter, this should be set to None
+ attr (str): Name of the dynamics parameter to modify. Valid options are self.dynamics_parameters
+ val (int or float or n-array): New value(s) to set for the given dynamics parameter. The type of this
+ argument should match the expected type for the given parameter.
+ """
+ # Make sure specified parameter is valid, and then modify it
+ assert (
+ attr in self.dynamics_parameters
+ ), "Invalid dynamics parameter specified! Supported parameters are: {};" " requested: {}".format(
+ self.dynamics_parameters, attr
+ )
+ # Modify the requested parameter (uses a clean way to programmatically call the appropriate method)
+ getattr(self, f"mod_{attr}")(name, val)
+
+ def mod_density(self, name=None, val=0.0):
+ """
+ Modifies the global medium density of the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#option for more details.
+
+ Args:
+ name (str): Name for this element. Should be left as None (opt has no name attribute)
+ val (float): New density value.
+ """
+ # Make sure inputs are of correct form
+ assert name is None, "No name should be specified if modding density!"
+
+ # Modify this value
+ self.sim.model.opt.density = val
+
+ def mod_viscosity(self, name=None, val=0.0):
+ """
+ Modifies the global medium viscosity of the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#option for more details.
+
+ Args:
+ name (str): Name for this element. Should be left as None (opt has no name attribute)
+ val (float): New viscosity value.
+ """
+ # Make sure inputs are of correct form
+        assert name is None, "No name should be specified if modding viscosity!"
+
+ # Modify this value
+ self.sim.model.opt.viscosity = val
+
+ def mod_position(self, name, val=(0, 0, 0)):
+ """
+ Modifies the @name's relative body position within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#body for more details.
+
+ Args:
+ name (str): Name for this element.
+ val (3-array): New (x, y, z) relative position.
+ """
+ # Modify this value
+ body_id = self.sim.model.body_name2id(name)
+ self.sim.model.body_pos[body_id] = np.array(val)
+
+ def mod_quaternion(self, name, val=(1, 0, 0, 0)):
+ """
+ Modifies the @name's relative body orientation (quaternion) within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#body for more details.
+
+ Note: This method automatically normalizes the inputted value.
+
+ Args:
+ name (str): Name for this element.
+ val (4-array): New (w, x, y, z) relative quaternion.
+ """
+ # Normalize the inputted value
+ val = np.array(val) / np.linalg.norm(val)
+ # Modify this value
+ body_id = self.sim.model.body_name2id(name)
+ self.sim.model.body_quat[body_id] = val
+
+ def mod_inertia(self, name, val):
+ """
+ Modifies the @name's relative body inertia within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#body for more details.
+
+ Args:
+ name (str): Name for this element.
+ val (3-array): New (ixx, iyy, izz) diagonal values in the inertia matrix.
+ """
+ # Modify this value if it's not a dummy body
+ if name not in self.dummy_bodies:
+ body_id = self.sim.model.body_name2id(name)
+ self.sim.model.body_inertia[body_id] = np.array(val)
+
+ def mod_mass(self, name, val):
+ """
+ Modifies the @name's mass within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#body for more details.
+
+ Args:
+ name (str): Name for this element.
+ val (float): New mass.
+ """
+ # Modify this value if it's not a dummy body
+ if name not in self.dummy_bodies:
+ body_id = self.sim.model.body_name2id(name)
+ self.sim.model.body_mass[body_id] = val
+
+ def mod_friction(self, name, val):
+ """
+ Modifies the @name's geom friction within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#geom for more details.
+
+ Args:
+ name (str): Name for this element.
+ val (3-array): New (sliding, torsional, rolling) friction values.
+ """
+ # Modify this value
+ geom_id = self.sim.model.geom_name2id(name)
+ self.sim.model.geom_friction[geom_id] = np.array(val)
+
+ def mod_solref(self, name, val):
+ """
+ Modifies the @name's geom contact solver parameters within the simulation.
+ See http://www.mujoco.org/book/modeling.html#CSolver for more details.
+
+ Args:
+ name (str): Name for this element.
+ val (2-array): New (timeconst, dampratio) solref values.
+ """
+ # Modify this value
+ geom_id = self.sim.model.geom_name2id(name)
+ self.sim.model.geom_solref[geom_id] = np.array(val)
+
+ def mod_solimp(self, name, val):
+ """
+ Modifies the @name's geom contact solver impedance parameters within the simulation.
+ See http://www.mujoco.org/book/modeling.html#CSolver for more details.
+
+ Args:
+ name (str): Name for this element.
+ val (5-array): New (dmin, dmax, width, midpoint, power) solimp values.
+ """
+ # Modify this value
+ geom_id = self.sim.model.geom_name2id(name)
+ self.sim.model.geom_solimp[geom_id] = np.array(val)
+
+ def mod_stiffness(self, name, val):
+ """
+ Modifies the @name's joint stiffness within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#joint for more details.
+
+        NOTE: If the stiffness is already at 0, we IGNORE this value, since a non-stiff (i.e., free-turning)
+        joint is fundamentally different from a stiffened joint
+
+ Args:
+ name (str): Name for this element.
+ val (float): New stiffness.
+ """
+ # Modify this value (only if there is stiffness to begin with)
+ jnt_id = self.sim.model.joint_name2id(name)
+ if self.sim.model.jnt_stiffness[jnt_id] != 0:
+ self.sim.model.jnt_stiffness[jnt_id] = val
+
+ def mod_frictionloss(self, name, val):
+ """
+ Modifies the @name's joint frictionloss within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#joint for more details.
+
+ NOTE: If the requested joint is a free joint, it will be ignored since it does not
+ make physical sense to have friction loss associated with this joint (air drag / damping
+ is already captured implicitly by the medium density / viscosity values)
+
+ Args:
+ name (str): Name for this element.
+ val (float): New friction loss.
+ """
+ # Modify this value (only if it's not a free joint)
+ jnt_id = self.sim.model.joint_name2id(name)
+ if self.sim.model.jnt_type[jnt_id] != 0:
+ dof_idx = [i for i, v in enumerate(self.sim.model.dof_jntid) if v == jnt_id]
+ self.sim.model.dof_frictionloss[dof_idx] = val
+
+ def mod_damping(self, name, val):
+ """
+ Modifies the @name's joint damping within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#joint for more details.
+
+ NOTE: If the requested joint is a free joint, it will be ignored since it does not
+ make physical sense to have damping associated with this joint (air drag / damping
+ is already captured implicitly by the medium density / viscosity values)
+
+ Args:
+ name (str): Name for this element.
+ val (float): New damping.
+ """
+ # Modify this value (only if it's not a free joint)
+ jnt_id = self.sim.model.joint_name2id(name)
+ if self.sim.model.jnt_type[jnt_id] != 0:
+ dof_idx = [i for i, v in enumerate(self.sim.model.dof_jntid) if v == jnt_id]
+ self.sim.model.dof_damping[dof_idx] = val
+
+ def mod_armature(self, name, val):
+ """
+ Modifies the @name's joint armature within the simulation.
+ See http://www.mujoco.org/book/XMLreference.html#joint for more details.
+
+ Args:
+ name (str): Name for this element.
+ val (float): New armature.
+ """
+ # Modify this value (only if it's not a free joint)
+ jnt_id = self.sim.model.joint_name2id(name)
+ if self.sim.model.jnt_type[jnt_id] != 0:
+ dof_idx = [i for i, v in enumerate(self.sim.model.dof_jntid) if v == jnt_id]
+ self.sim.model.dof_armature[dof_idx] = val
+
+ @property
+ def dynamics_parameters(self):
+ """
+ Returns:
+ set: All dynamics parameters that can be randomized using this modder.
+ """
+ return {
+ # Opt parameters
+ "density",
+ "viscosity",
+ # Body parameters
+ "position",
+ "quaternion",
+ "inertia",
+ "mass",
+ # Geom parameters
+ "friction",
+ "solref",
+ "solimp",
+ # Joint parameters
+ "stiffness",
+ "frictionloss",
+ "damping",
+ "armature",
+ }
+
+ @property
+ def opt(self):
+ """
+ Returns:
+ PyMjOption: MjModel sim options
+ """
+ return self.sim.model.opt
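+
+
+# Example usage (illustrative sketch, not part of the original module; assumes an existing
+# MjSim instance named `sim` and a non-zero-mass body named "cube_main" in the model):
+#
+#     modder = DynamicsModder(sim, randomize_density=False)
+#     modder.mod("cube_main", "mass", 0.5)      # set a body mass directly
+#     modder.mod(None, "viscosity", 0.002)      # opt parameters take name=None
+#     modder.update()                           # propagate the changes into the sim
+#     modder.randomize()                        # or randomize all enabled parameters
+#     modder.restore_defaults()                 # revert to the originally saved values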
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/numba.py b/phantom/submodules/phantom-robosuite/robosuite/utils/numba.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd5d8549758e24993abbd72bbf970d90c0e0091b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/numba.py
@@ -0,0 +1,12 @@
+"""
+Numba utils.
+"""
+import numba
+
+import robosuite.macros as macros
+
+
+def jit_decorator(func):
+ if macros.ENABLE_NUMBA:
+ return numba.jit(nopython=True, cache=macros.CACHE_NUMBA)(func)
+ return func
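+
+
+# Example usage (illustrative sketch; `_squared_distance` is a hypothetical function):
+#
+#     @jit_decorator
+#     def _squared_distance(a, b):
+#         return ((a - b) ** 2).sum()
+#
+# When macros.ENABLE_NUMBA is True the function is compiled with numba.jit(nopython=True);
+# otherwise it is returned unchanged.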
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/observables.py b/phantom/submodules/phantom-robosuite/robosuite/utils/observables.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c23e8189bcdc1d84a47efe14299a661e11de468
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/observables.py
@@ -0,0 +1,403 @@
+import numpy as np
+
+
+def sensor(modality):
+ """
+ Decorator that should be added to any sensors that will be an observable.
+
+ Decorated functions should have signature:
+
+ any = func(obs_cache)
+
+ Where @obs_cache is a dictionary mapping observable keys to pre-computed values, and @any is either a scalar
+    or array. This function should also handle the case where obs_cache is either None or an empty dict.
+
+ An example use case is shown below:
+
+ >>> @sensor(modality="proprio")
+ >>> def joint_pos(obs_cache):
+ # Always handle case if obs_cache is empty
+ if not obs_cache:
+ return np.zeros(7)
+ # Otherwise, run necessary calculations and return output
+ ...
+ out = ...
+ return out
+
+ Args:
+ modality (str): Modality for this sensor
+
+ Returns:
+ function: decorator function
+ """
+ # Define standard decorator (with no args)
+ def decorator(func):
+ # Add modality attribute
+ func.__modality__ = modality
+ # Return function
+ return func
+
+ return decorator
+
+
+def create_deterministic_corrupter(corruption, low=-np.inf, high=np.inf):
+ """
+    Creates a deterministic corrupter that adds the same constant corruption value to every sensor reading
+
+ Args:
+ corruption (float): Corruption to apply
+ low (float): Minimum value for output for clipping
+ high (float): Maximum value for output for clipping
+
+ Returns:
+ function: corrupter
+ """
+
+ def corrupter(inp):
+ inp = np.array(inp)
+ return np.clip(inp + corruption, low, high)
+
+ return corrupter
+
+
+def create_uniform_noise_corrupter(min_noise, max_noise, low=-np.inf, high=np.inf):
+ """
+    Creates a corrupter that adds uniform noise sampled from [@min_noise, @max_noise] to a given input, clipped to [@low, @high]
+
+ Args:
+ min_noise (float): Minimum noise to apply
+ max_noise (float): Maximum noise to apply
+ low (float): Minimum value for output for clipping
+        high (float): Maximum value for output for clipping
+
+ Returns:
+ function: corrupter
+ """
+
+ def corrupter(inp):
+ inp = np.array(inp)
+ noise = (max_noise - min_noise) * np.random.random_sample(inp.shape) + min_noise
+ return np.clip(inp + noise, low, high)
+
+ return corrupter
+
+
+def create_gaussian_noise_corrupter(mean, std, low=-np.inf, high=np.inf):
+ """
+ Creates a corrupter that applies gaussian noise to a given input with mean @mean and std dev @std
+
+ Args:
+ mean (float): Mean of the noise to apply
+ std (float): Standard deviation of the noise to apply
+ low (float): Minimum value for output for clipping
+        high (float): Maximum value for output for clipping
+
+ Returns:
+ function: corrupter
+ """
+
+ def corrupter(inp):
+ inp = np.array(inp)
+ noise = mean + std * np.random.randn(*inp.shape)
+ return np.clip(inp + noise, low, high)
+
+ return corrupter
+
+
+def create_deterministic_delayer(delay):
+ """
+ Create a deterministic delayer that always returns the same delay value
+
+ Args:
+ delay (float): Delay value to return
+
+ Returns:
+ function: delayer
+ """
+ assert delay >= 0, "Inputted delay must be non-negative!"
+ return lambda: delay
+
+
+def create_uniform_sampled_delayer(min_delay, max_delay):
+ """
+    Creates a uniformly sampled delayer, with minimum delay @min_delay and maximum delay @max_delay, both inclusive
+
+ Args:
+ min_delay (float): Minimum possible delay
+        max_delay (float): Maximum possible delay
+
+ Returns:
+ function: delayer
+ """
+ assert min(min_delay, max_delay) >= 0, "Inputted delay must be non-negative!"
+ return lambda: min_delay + (max_delay - min_delay) * np.random.random()
+
+
+def create_gaussian_sampled_delayer(mean, std):
+ """
+ Creates a gaussian sampled delayer, with average delay @mean which varies by standard deviation @std
+
+ Args:
+ mean (float): Average delay
+ std (float): Standard deviation of the delay variation
+
+ Returns:
+ function: delayer
+ """
+ assert mean >= 0, "Inputted mean delay must be non-negative!"
+ return lambda: max(0.0, int(np.round(mean + std * np.random.randn())))
+
+
+# Common defaults to use
+NO_CORRUPTION = lambda inp: inp
+NO_FILTER = lambda inp: inp
+NO_DELAY = lambda: 0.0
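+
+# Example usage of the corrupter / delayer factories above (illustrative sketch; the numbers
+# below are arbitrary):
+#
+#     corrupter = create_gaussian_noise_corrupter(mean=0.0, std=0.01)
+#     delayer = create_uniform_sampled_delayer(min_delay=0.0, max_delay=0.02)
+#     noisy_reading = corrupter([0.1, 0.2, 0.3])   # adds N(0, 0.01) noise to each element
+#     delay = delayer()                            # seconds to delay the next sample by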
+
+
+class Observable:
+ """
+ Base class for all observables -- defines interface for interacting with sensors
+
+ Args:
+ name (str): Name for this observable
+ sensor (function with `sensor` decorator): Method to grab raw sensor data for this observable. Should take in a
+ single dict argument (observation cache if a pre-computed value is required) and return the raw sensor data
+ for the current timestep. Must handle case if inputted argument is empty ({}), and should have `sensor`
+ decorator when defined
+ corrupter (None or function): Method to corrupt the raw sensor data for this observable. Should take in
+ the output of @sensor and return the same type (corrupted data). If None, results in default no corruption
+ filter (None or function): Method to filter the outputted reading for this observable. Should take in the output
+ of @corrupter and return the same type (filtered data). If None, results in default no filter. Note that
+ this function can also double as an observer, where sampled data is recorded by this function.
+ delayer (None or function): Method to delay the raw sensor data when polling this observable. Should take in
+ no arguments and return a float, for the number of seconds to delay the measurement by. If None, results in
+ default no delayer
+ sampling_rate (float): Sampling rate for this observable (Hz)
+ enabled (bool): Whether this sensor is enabled or not. If enabled, this observable's values
+ are continually computed / updated every time update() is called.
+ active (bool): Whether this sensor is active or not. If active, this observable's current
+ observed value is returned from self.obs, otherwise self.obs returns None.
+ """
+
+ def __init__(
+ self,
+ name,
+ sensor,
+ corrupter=None,
+ filter=None,
+ delayer=None,
+ sampling_rate=20,
+ enabled=True,
+ active=True,
+ ):
+ # Set all internal variables and methods
+ self.name = name
+ self._sensor = sensor
+ self._corrupter = corrupter if corrupter is not None else NO_CORRUPTION
+ self._filter = filter if filter is not None else NO_FILTER
+ self._delayer = delayer if delayer is not None else NO_DELAY
+ self._sampling_timestep = 1.0 / sampling_rate
+ self._enabled = enabled
+ self._active = active
+ self._is_number = False # filled in during sensor check call
+ self._data_shape = (1,) # filled in during sensor check call
+
+ # Make sure sensor is working
+ self._check_sensor_validity()
+
+ # These values will be modified during update() call
+ self._time_since_last_sample = 0.0 # seconds
+ self._current_delay = self._delayer() # seconds
+ self._current_observed_value = 0 if self._is_number else np.zeros(self._data_shape)
+ self._sampled = False
+
+ def update(self, timestep, obs_cache, force=False):
+ """
+ Updates internal values for this observable, if enabled.
+
+ Args:
+ timestep (float): Amount of simulation time (in sec) that has passed since last call.
+ obs_cache (dict): Observation cache mapping observable names to pre-computed values to pass to sensor. This
+ will be updated in-place during this call.
+ force (bool): If True, will force the observable to update its internal value to the newest value.
+ """
+ if self._enabled:
+ # Increment internal time counter
+ self._time_since_last_sample += timestep
+
+ # If the delayed sampling time has been passed and we haven't sampled yet for this sampling period,
+ # we should grab a new measurement
+ if (
+ not self._sampled and self._sampling_timestep - self._current_delay >= self._time_since_last_sample
+ ) or force:
+ # Get newest raw value, corrupt it, filter it, and set it as our current observed value
+ obs = np.array(self._filter(self._corrupter(self._sensor(obs_cache))))
+ self._current_observed_value = obs[0] if len(obs.shape) == 1 and obs.shape[0] == 1 else obs
+ # Update cache entry as well
+ obs_cache[self.name] = np.array(self._current_observed_value)
+ # Toggle sampled and re-sample next time delay
+ self._sampled = True
+ self._current_delay = self._delayer()
+
+ # If our total time since last sample has surpassed our sampling timestep,
+ # then we reset our timer and sampled flag
+ if self._time_since_last_sample >= self._sampling_timestep:
+ if not self._sampled:
+ # If we still haven't sampled yet, sample immediately and warn user that sampling rate is too low
+ print(
+ f"Warning: sampling rate for observable {self.name} is either too low or delay is too high. "
+ f"Please adjust one (or both)"
+ )
+ # Get newest raw value, corrupt it, filter it, and set it as our current observed value
+ obs = np.array(self._filter(self._corrupter(self._sensor(obs_cache))))
+ self._current_observed_value = obs[0] if len(obs.shape) == 1 and obs.shape[0] == 1 else obs
+ # Update cache entry as well
+ obs_cache[self.name] = np.array(self._current_observed_value)
+ # Re-sample next time delay
+ self._current_delay = self._delayer()
+ self._time_since_last_sample %= self._sampling_timestep
+ self._sampled = False
+
+ def reset(self):
+ """
+ Resets this observable's internal values (but does not reset its sensor, corrupter, delayer, or filter)
+ """
+ self._time_since_last_sample = 0.0
+ self._current_delay = self._delayer()
+ self._current_observed_value = 0 if self._is_number else np.zeros(self._data_shape)
+
+ def is_enabled(self):
+ """
+ Determines whether observable is enabled or not. This observable is considered enabled if its values
+ are being continually computed / updated during each update() call.
+
+ Returns:
+ bool: True if this observable is enabled
+ """
+ return self._enabled
+
+ def is_active(self):
+ """
+ Determines whether observable is active or not. This observable is considered active if its current observation
+ value is being returned in self.obs.
+
+ Returns:
+ bool: True if this observable is active
+ """
+ return self._active
+
+ def set_enabled(self, enabled):
+ """
+ Sets whether this observable is enabled or not. If enabled, this observable's values
+ are continually computed / updated every time update() is called.
+
+ Args:
+ enabled (bool): True if this observable should be enabled
+ """
+ self._enabled = enabled
+ # Reset values
+ self.reset()
+
+ def set_active(self, active):
+ """
+ Sets whether this observable is active or not. If active, this observable's current
+ observed value is returned from self.obs, otherwise self.obs returns None.
+
+ Args:
+ active (bool): True if this observable should be active
+ """
+ self._active = active
+
+ def set_sensor(self, sensor):
+ """
+ Sets the sensor for this observable.
+
+ Args:
+ sensor (function with sensor decorator): Method to grab raw sensor data for this observable. Should take in
+ a single dict argument (observation cache if a pre-computed value is required) and return the raw
+ sensor data for the current timestep. Must handle case if inputted argument is empty ({}), and should
+ have `sensor` decorator when defined
+ """
+ self._sensor = sensor
+ self._check_sensor_validity()
+
+ def set_corrupter(self, corrupter):
+ """
+ Sets the corrupter for this observable.
+
+ Args:
+ corrupter (None or function): Method to corrupt the raw sensor data for this observable. Should take in
+ the output of self.sensor and return the same type (corrupted data).
+ If None, results in default no corruption
+ """
+ self._corrupter = corrupter if corrupter is not None else NO_CORRUPTION
+
+ def set_filter(self, filter):
+ """
+ Sets the filter for this observable. Note that this function can also double as an observer, where sampled
+ data is recorded by this function.
+
+ Args:
+ filter (None or function): Method to filter the outputted reading for this observable. Should take in
+ the output of @corrupter and return the same type (filtered data).
+ If None, results in default no filter
+ """
+ self._filter = filter if filter is not None else NO_FILTER
+
+ def set_delayer(self, delayer):
+ """
+ Sets the delayer for this observable.
+
+ Args:
+ delayer (None or function): Method to delay the raw sensor data when polling this observable. Should take
+ in no arguments and return a float, for the number of seconds to delay the measurement by.
+                If None, results in default no delayer
+ """
+ self._delayer = delayer if delayer is not None else NO_DELAY
+
+ def set_sampling_rate(self, rate):
+ """
+ Sets the sampling rate for this observable.
+
+ Args:
+ rate (int): New sampling rate for this observable (Hz)
+ """
+ self._sampling_timestep = 1.0 / rate
+
+ def _check_sensor_validity(self):
+ """
+ Internal function that checks the validity of this observable's sensor. It does the following:
+
+ - Asserts that the inputted sensor has its __modality__ attribute defined from the sensor decorator
+ - Asserts that the inputted sensor can handle the empty dict {} arg case
+        - Updates the corresponding data shape and scalar/array flag for this sensor
+ """
+ try:
+ _ = self.modality
+ self._data_shape = np.array(self._sensor({})).shape
+ self._is_number = len(self._data_shape) == 1 and self._data_shape[0] == 1
+ except Exception as e:
+ from robosuite.utils.log_utils import ROBOSUITE_DEFAULT_LOGGER
+
+ ROBOSUITE_DEFAULT_LOGGER.error(e)
+ raise ValueError("Current sensor for observable {} is invalid.".format(self.name))
+
+ @property
+ def obs(self):
+ """
+ Current observation from this observable
+
+ Returns:
+ None or float or np.array: If active, current observed value from this observable. Otherwise, None
+ """
+ return self._current_observed_value if self._active else None
+
+ @property
+ def modality(self):
+ """
+ Modality of this sensor
+
+ Returns:
+ str: Modality name for this observable
+ """
+ return self._sensor.__modality__
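+
+
+# Example usage (illustrative sketch; `eef_pos` is a hypothetical sensor and the numbers
+# are arbitrary):
+#
+#     @sensor(modality="proprio")
+#     def eef_pos(obs_cache):
+#         return obs_cache.get("eef_pos_raw", np.zeros(3))
+#
+#     observable = Observable(
+#         name="eef_pos",
+#         sensor=eef_pos,
+#         corrupter=create_gaussian_noise_corrupter(mean=0.0, std=0.005),
+#         delayer=create_deterministic_delayer(0.0),
+#         sampling_rate=20,
+#     )
+#     cache = {}
+#     observable.update(timestep=0.05, obs_cache=cache)
+#     current_value = observable.obs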
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/opencv_renderer.py b/phantom/submodules/phantom-robosuite/robosuite/utils/opencv_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d6386b75870248519a12a18e62c233e256c246
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/opencv_renderer.py
@@ -0,0 +1,50 @@
+"""
+opencv renderer class.
+"""
+import cv2
+import numpy as np
+
+
+class OpenCVRenderer:
+ def __init__(self, sim):
+ # TODO: update this appropriately - need to get screen dimensions
+ self.width = 1280
+ self.height = 800
+
+ self.sim = sim
+ self.camera_name = self.sim.model.camera_id2name(0)
+
+ self.keypress_callback = None
+
+ def set_camera(self, camera_id):
+ """
+ Set the camera view to the specified camera ID.
+ Args:
+ camera_id (int): id of the camera to set the current viewer to
+ """
+ self.camera_name = self.sim.model.camera_id2name(camera_id)
+
+ def render(self):
+ # get frame with offscreen renderer (assumes that the renderer already exists)
+ im = self.sim.render(camera_name=self.camera_name, height=self.height, width=self.width)[..., ::-1]
+
+ # write frame to window
+ im = np.flip(im, axis=0)
+ cv2.imshow("offscreen render", im)
+ key = cv2.waitKey(1)
+ if self.keypress_callback:
+ self.keypress_callback(key)
+
+ def add_keypress_callback(self, keypress_callback):
+ self.keypress_callback = keypress_callback
+
+ def close(self):
+ """
+ Any cleanup to close renderer.
+ """
+
+ # NOTE: assume that @sim will get cleaned up outside the renderer - just delete the reference
+ self.sim = None
+
+ # close window
+ cv2.destroyAllWindows()
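+
+
+# Example usage (illustrative sketch; assumes an existing MjSim instance named `sim` with
+# an offscreen renderer available):
+#
+#     viewer = OpenCVRenderer(sim)
+#     viewer.set_camera(camera_id=0)
+#     for _ in range(100):
+#         sim.step()
+#         viewer.render()
+#     viewer.close()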
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/placement_samplers.py b/phantom/submodules/phantom-robosuite/robosuite/utils/placement_samplers.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bd8bb99712d8dac51ca5acf90261ca21cba60c4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/placement_samplers.py
@@ -0,0 +1,441 @@
+import collections
+from copy import copy
+
+import numpy as np
+
+from robosuite.models.objects import MujocoObject
+from robosuite.utils import RandomizationError
+from robosuite.utils.transform_utils import quat_multiply
+
+
+class ObjectPositionSampler:
+ """
+ Base class of object placement sampler.
+
+ Args:
+ name (str): Name of this sampler.
+
+ mujoco_objects (None or MujocoObject or list of MujocoObject): single model or list of MJCF object models
+
+ ensure_object_boundary_in_range (bool): If True, will ensure that the object is enclosed within a given boundary
+ (should be implemented by subclass)
+
+ ensure_valid_placement (bool): If True, will check for correct (valid) object placements
+
+ reference_pos (3-array): global (x,y,z) position relative to which sampling will occur
+
+ z_offset (float): Add a small z-offset to placements. This is useful for fixed objects
+ that do not move (i.e. no free joint) to place them above the table.
+ """
+
+ def __init__(
+ self,
+ name,
+ mujoco_objects=None,
+ ensure_object_boundary_in_range=True,
+ ensure_valid_placement=True,
+ reference_pos=(0, 0, 0),
+ z_offset=0.0,
+ ):
+ # Setup attributes
+ self.name = name
+ if mujoco_objects is None:
+ self.mujoco_objects = []
+ else:
+ # Shallow copy the list so we don't modify the inputted list but still keep the object references
+ self.mujoco_objects = [mujoco_objects] if isinstance(mujoco_objects, MujocoObject) else copy(mujoco_objects)
+ self.ensure_object_boundary_in_range = ensure_object_boundary_in_range
+ self.ensure_valid_placement = ensure_valid_placement
+ self.reference_pos = reference_pos
+ self.z_offset = z_offset
+
+ def add_objects(self, mujoco_objects):
+ """
+ Add additional objects to this sampler. Checks to make sure there's no identical objects already stored.
+
+ Args:
+ mujoco_objects (MujocoObject or list of MujocoObject): single model or list of MJCF object models
+ """
+ mujoco_objects = [mujoco_objects] if isinstance(mujoco_objects, MujocoObject) else mujoco_objects
+ for obj in mujoco_objects:
+ assert obj not in self.mujoco_objects, "Object '{}' already in sampler!".format(obj.name)
+ self.mujoco_objects.append(obj)
+
+ def reset(self):
+ """
+ Resets this sampler. Removes all mujoco objects from this sampler.
+ """
+ self.mujoco_objects = []
+
+ def sample(self, fixtures=None, reference=None, on_top=True):
+ """
+ Uniformly sample on a surface (not necessarily table surface).
+
+ Args:
+ fixtures (dict): dictionary of current object placements in the scene as well as any other relevant
+ obstacles that should not be in contact with newly sampled objects. Used to make sure newly
+ generated placements are valid. Should be object names mapped to (pos, quat, MujocoObject)
+
+ reference (str or 3-tuple or None): if provided, sample relative placement. Can either be a string, which
+ corresponds to an existing object found in @fixtures, or a direct (x,y,z) value. If None, will sample
+ relative to this sampler's `'reference_pos'` value.
+
+ on_top (bool): if True, sample placement on top of the reference object.
+
+ Return:
+ dict: dictionary of all object placements, mapping object_names to (pos, quat, obj), including the
+ placements specified in @fixtures. Note quat is in (w,x,y,z) form
+ """
+ raise NotImplementedError
+
+
+class UniformRandomSampler(ObjectPositionSampler):
+ """
+    Places all objects uniformly at random within the specified x-y ranges.
+
+ Args:
+ name (str): Name of this sampler.
+
+ mujoco_objects (None or MujocoObject or list of MujocoObject): single model or list of MJCF object models
+
+ x_range (2-array of float): Specify the (min, max) relative x_range used to uniformly place objects
+
+ y_range (2-array of float): Specify the (min, max) relative y_range used to uniformly place objects
+
+ rotation (None or float or Iterable):
+            :`None`: Add uniform random rotation
+ :`Iterable (a,b)`: Uniformly randomize rotation angle between a and b (in radians)
+ :`value`: Add fixed angle rotation
+
+ rotation_axis (str): Can be 'x', 'y', or 'z'. Axis about which to apply the requested rotation
+
+ ensure_object_boundary_in_range (bool):
+            :`True`: The center of the object is at position:
+                 [uniform(min x_range + radius, max x_range - radius)], [uniform(min y_range + radius, max y_range - radius)]
+ :`False`:
+                 [uniform(min x_range, max x_range)], [uniform(min y_range, max y_range)]
+
+ ensure_valid_placement (bool): If True, will check for correct (valid) object placements
+
+ reference_pos (3-array): global (x,y,z) position relative to which sampling will occur
+
+ z_offset (float): Add a small z-offset to placements. This is useful for fixed objects
+ that do not move (i.e. no free joint) to place them above the table.
+ """
+
+ def __init__(
+ self,
+ name,
+ mujoco_objects=None,
+ x_range=(0, 0),
+ y_range=(0, 0),
+ rotation=None,
+ rotation_axis="z",
+ ensure_object_boundary_in_range=True,
+ ensure_valid_placement=True,
+ reference_pos=(0, 0, 0),
+ z_offset=0.0,
+ ):
+ self.x_range = x_range
+ self.y_range = y_range
+ self.rotation = rotation
+ self.rotation_axis = rotation_axis
+
+ super().__init__(
+ name=name,
+ mujoco_objects=mujoco_objects,
+ ensure_object_boundary_in_range=ensure_object_boundary_in_range,
+ ensure_valid_placement=ensure_valid_placement,
+ reference_pos=reference_pos,
+ z_offset=z_offset,
+ )
+
+ def _sample_x(self, object_horizontal_radius):
+ """
+ Samples the x location for a given object
+
+ Args:
+ object_horizontal_radius (float): Radius of the object currently being sampled for
+
+ Returns:
+ float: sampled x position
+ """
+ minimum, maximum = self.x_range
+ if self.ensure_object_boundary_in_range:
+ minimum += object_horizontal_radius
+ maximum -= object_horizontal_radius
+ return np.random.uniform(high=maximum, low=minimum)
+
+ def _sample_y(self, object_horizontal_radius):
+ """
+ Samples the y location for a given object
+
+ Args:
+ object_horizontal_radius (float): Radius of the object currently being sampled for
+
+ Returns:
+ float: sampled y position
+ """
+ minimum, maximum = self.y_range
+ if self.ensure_object_boundary_in_range:
+ minimum += object_horizontal_radius
+ maximum -= object_horizontal_radius
+ return np.random.uniform(high=maximum, low=minimum)
+
+ def _sample_quat(self):
+ """
+ Samples the orientation for a given object
+
+ Returns:
+ np.array: sampled object quaternion in (w,x,y,z) form
+
+ Raises:
+ ValueError: [Invalid rotation axis]
+ """
+ if self.rotation is None:
+ rot_angle = np.random.uniform(high=2 * np.pi, low=0)
+ elif isinstance(self.rotation, collections.abc.Iterable):
+ rot_angle = np.random.uniform(high=max(self.rotation), low=min(self.rotation))
+ else:
+ rot_angle = self.rotation
+
+ # Return angle based on axis requested
+ if self.rotation_axis == "x":
+ return np.array([np.cos(rot_angle / 2), np.sin(rot_angle / 2), 0, 0])
+ elif self.rotation_axis == "y":
+ return np.array([np.cos(rot_angle / 2), 0, np.sin(rot_angle / 2), 0])
+ elif self.rotation_axis == "z":
+ return np.array([np.cos(rot_angle / 2), 0, 0, np.sin(rot_angle / 2)])
+ else:
+ # Invalid axis specified, raise error
+ raise ValueError(
+ "Invalid rotation axis specified. Must be 'x', 'y', or 'z'. Got: {}".format(self.rotation_axis)
+ )
+
+ def sample(self, fixtures=None, reference=None, on_top=True):
+ """
+ Uniformly sample relative to this sampler's reference_pos or @reference (if specified).
+
+ Args:
+ fixtures (dict): dictionary of current object placements in the scene as well as any other relevant
+ obstacles that should not be in contact with newly sampled objects. Used to make sure newly
+ generated placements are valid. Should be object names mapped to (pos, quat, MujocoObject)
+
+ reference (str or 3-tuple or None): if provided, sample relative placement. Can either be a string, which
+ corresponds to an existing object found in @fixtures, or a direct (x,y,z) value. If None, will sample
+ relative to this sampler's `'reference_pos'` value.
+
+ on_top (bool): if True, sample placement on top of the reference object. This corresponds to a sampled
+ z-offset of the current sampled object's bottom_offset + the reference object's top_offset
+ (if specified)
+
+ Return:
+ dict: dictionary of all object placements, mapping object_names to (pos, quat, obj), including the
+ placements specified in @fixtures. Note quat is in (w,x,y,z) form
+
+ Raises:
+ RandomizationError: [Cannot place all objects]
+ AssertionError: [Reference object name does not exist, invalid inputs]
+ """
+ # Standardize inputs
+ placed_objects = {} if fixtures is None else copy(fixtures)
+ if reference is None:
+ base_offset = self.reference_pos
+ elif type(reference) is str:
+ assert (
+ reference in placed_objects
+ ), "Invalid reference received. Current options are: {}, requested: {}".format(
+ placed_objects.keys(), reference
+ )
+ ref_pos, _, ref_obj = placed_objects[reference]
+ base_offset = np.array(ref_pos)
+ if on_top:
+ base_offset += np.array((0, 0, ref_obj.top_offset[-1]))
+ else:
+ base_offset = np.array(reference)
+ assert (
+ base_offset.shape[0] == 3
+ ), "Invalid reference received. Should be (x,y,z) 3-tuple, but got: {}".format(base_offset)
+
+ # Sample pos and quat for all objects assigned to this sampler
+ for obj in self.mujoco_objects:
+ # First make sure the currently sampled object hasn't already been sampled
+ assert obj.name not in placed_objects, "Object '{}' has already been sampled!".format(obj.name)
+
+ horizontal_radius = obj.horizontal_radius
+ bottom_offset = obj.bottom_offset
+ success = False
+ for i in range(5000): # 5000 retries
+ object_x = self._sample_x(horizontal_radius) + base_offset[0]
+ object_y = self._sample_y(horizontal_radius) + base_offset[1]
+ object_z = self.z_offset + base_offset[2]
+ if on_top:
+ object_z -= bottom_offset[-1]
+
+ # objects cannot overlap
+ location_valid = True
+ if self.ensure_valid_placement:
+ for (x, y, z), _, other_obj in placed_objects.values():
+ if (
+ np.linalg.norm((object_x - x, object_y - y))
+ <= other_obj.horizontal_radius + horizontal_radius
+ ) and (object_z - z <= other_obj.top_offset[-1] - bottom_offset[-1]):
+ location_valid = False
+ break
+
+ if location_valid:
+ # random rotation
+ quat = self._sample_quat()
+
+ # multiply this quat by the object's initial rotation if it has the attribute specified
+ if hasattr(obj, "init_quat"):
+ quat = quat_multiply(quat, obj.init_quat)
+
+ # location is valid, put the object down
+ pos = (object_x, object_y, object_z)
+ placed_objects[obj.name] = (pos, quat, obj)
+ success = True
+ break
+
+ if not success:
+ raise RandomizationError("Cannot place all objects ):")
+
+ return placed_objects
+
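+# Example usage (illustrative sketch; `cube` stands in for an existing MujocoObject and the
+# ranges below are arbitrary):
+#
+#     sampler = UniformRandomSampler(
+#         name="CubeSampler",
+#         mujoco_objects=cube,
+#         x_range=(-0.1, 0.1),
+#         y_range=(-0.1, 0.1),
+#         rotation=None,                 # uniform random rotation about the z axis
+#         reference_pos=(0, 0, 0.8),     # e.g., the table surface height
+#     )
+#     placements = sampler.sample()
+#     pos, quat, obj = placements[cube.name]
+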
+
+class SequentialCompositeSampler(ObjectPositionSampler):
+ """
+ Samples position for each object sequentially. Allows chaining
+ multiple placement initializers together - so that object locations can
+ be sampled on top of other objects or relative to other object placements.
+
+ Args:
+ name (str): Name of this sampler.
+ """
+
+ def __init__(self, name):
+ # Samplers / args will be filled in later
+ self.samplers = collections.OrderedDict()
+ self.sample_args = collections.OrderedDict()
+
+ super().__init__(name=name)
+
+ def append_sampler(self, sampler, sample_args=None):
+ """
+ Adds a new placement initializer with corresponding @sampler and arguments
+
+ Args:
+ sampler (ObjectPositionSampler): sampler to add
+ sample_args (None or dict): If specified, should be additional arguments to pass to @sampler's sample()
+ call. Should map corresponding sampler's arguments to values (excluding @fixtures argument)
+
+ Raises:
+ AssertionError: [Object name in samplers]
+ """
+ # Verify that all added mujoco objects haven't already been added, and add to this sampler's objects dict
+ for obj in sampler.mujoco_objects:
+ assert obj not in self.mujoco_objects, f"Object '{obj.name}' already has sampler associated with it!"
+ self.mujoco_objects.append(obj)
+ self.samplers[sampler.name] = sampler
+ self.sample_args[sampler.name] = sample_args
+
+ def hide(self, mujoco_objects):
+ """
+ Helper method to remove an object from the workspace.
+
+ Args:
+ mujoco_objects (MujocoObject or list of MujocoObject): Object(s) to hide
+ """
+ sampler = UniformRandomSampler(
+ name="HideSampler",
+ mujoco_objects=mujoco_objects,
+ x_range=[-10, -20],
+ y_range=[-10, -20],
+ rotation=[0, 0],
+ rotation_axis="z",
+ z_offset=10,
+ ensure_object_boundary_in_range=False,
+ ensure_valid_placement=False,
+ )
+ self.append_sampler(sampler=sampler)
+
+ def add_objects(self, mujoco_objects):
+ """
+ Override super method to make sure user doesn't call this (all objects should implicitly belong to sub-samplers)
+ """
+        raise AttributeError("add_objects() should not be called for SequentialCompositeSampler!")
+
+ def add_objects_to_sampler(self, sampler_name, mujoco_objects):
+ """
+ Adds specified @mujoco_objects to sub-sampler with specified @sampler_name.
+
+ Args:
+ sampler_name (str): Existing sub-sampler name
+ mujoco_objects (MujocoObject or list of MujocoObject): Object(s) to add
+ """
+ # First verify that all mujoco objects haven't already been added, and add to this sampler's objects dict
+ mujoco_objects = [mujoco_objects] if isinstance(mujoco_objects, MujocoObject) else mujoco_objects
+ for obj in mujoco_objects:
+ assert obj not in self.mujoco_objects, f"Object '{obj.name}' already has sampler associated with it!"
+ self.mujoco_objects.append(obj)
+ # Make sure sampler_name exists
+ assert (
+ sampler_name in self.samplers.keys()
+ ), "Invalid sub-sampler specified, valid options are: {}, " "requested: {}".format(
+ self.samplers.keys(), sampler_name
+ )
+ # Add the mujoco objects to the requested sub-sampler
+ self.samplers[sampler_name].add_objects(mujoco_objects)
+
+ def reset(self):
+ """
+ Resets this sampler. In addition to base method, iterates over all sub-samplers and resets them
+ """
+ super().reset()
+ for sampler in self.samplers.values():
+ sampler.reset()
+
+ def sample(self, fixtures=None, reference=None, on_top=True):
+ """
+ Sample from each placement initializer sequentially, in the order
+ that they were appended.
+
+ Args:
+ fixtures (dict): dictionary of current object placements in the scene as well as any other relevant
+ obstacles that should not be in contact with newly sampled objects. Used to make sure newly
+ generated placements are valid. Should be object names mapped to (pos, quat, MujocoObject)
+
+ reference (str or 3-tuple or None): if provided, sample relative placement. This will override each
+ sampler's @reference argument if not already specified. Can either be a string, which
+ corresponds to an existing object found in @fixtures, or a direct (x,y,z) value. If None, will sample
+ relative to this sampler's `'reference_pos'` value.
+
+ on_top (bool): if True, sample placement on top of the reference object. This will override each
+ sampler's @on_top argument if not already specified. This corresponds to a sampled
+ z-offset of the current sampled object's bottom_offset + the reference object's top_offset
+ (if specified)
+
+ Return:
+ dict: dictionary of all object placements, mapping object_names to (pos, quat, obj), including the
+ placements specified in @fixtures. Note quat is in (w,x,y,z) form
+
+ Raises:
+ RandomizationError: [Cannot place all objects]
+ """
+ # Standardize inputs
+ placed_objects = {} if fixtures is None else copy(fixtures)
+
+ # Iterate through all samplers to sample
+ for sampler, s_args in zip(self.samplers.values(), self.sample_args.values()):
+ # Pre-process sampler args
+ if s_args is None:
+ s_args = {}
+ for arg_name, arg in zip(("reference", "on_top"), (reference, on_top)):
+ if arg_name not in s_args:
+ s_args[arg_name] = arg
+ # Run sampler
+ new_placements = sampler.sample(fixtures=placed_objects, **s_args)
+ # Update placements
+ placed_objects.update(new_placements)
+
+ return placed_objects
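+
+
+# Example usage (illustrative sketch; `cube` and `ball` stand in for existing MujocoObjects):
+#
+#     placement_initializer = SequentialCompositeSampler(name="ObjectSampler")
+#     placement_initializer.append_sampler(
+#         UniformRandomSampler(name="CubeSampler", mujoco_objects=cube, x_range=(-0.1, 0.1), y_range=(-0.1, 0.1))
+#     )
+#     placement_initializer.append_sampler(
+#         UniformRandomSampler(name="BallSampler", mujoco_objects=ball, x_range=(-0.05, 0.05), y_range=(-0.05, 0.05)),
+#         sample_args={"reference": cube.name, "on_top": True},
+#     )
+#     placements = placement_initializer.sample()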
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/robot_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/robot_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..53200dcd6d64c56c6a622b684ba10c62498e6a40
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/robot_utils.py
@@ -0,0 +1,16 @@
+# Utility functions for working with robots
+
+from robosuite.robots import BIMANUAL_ROBOTS
+
+
+def check_bimanual(robot_name):
+ """
+ Utility function that returns whether the inputted robot_name is a bimanual robot or not
+
+ Args:
+ robot_name (str): Name of the robot to check
+
+ Returns:
+ bool: True if the inputted robot is a bimanual robot
+ """
+ return robot_name.lower() in BIMANUAL_ROBOTS
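+
+
+# Example usage (illustrative sketch; results depend on which robots are registered in
+# BIMANUAL_ROBOTS for the installed robosuite version):
+#
+#     check_bimanual("Baxter")   # True if "baxter" is registered as a bimanual robot
+#     check_bimanual("Panda")    # False for single-arm robots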
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/sim_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/sim_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5452add1a7ee82b947e81c76bd43698ff128047d
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/sim_utils.py
@@ -0,0 +1,67 @@
+"""
+Collection of useful simulation utilities
+"""
+
+from robosuite.models.base import MujocoModel
+
+
+def check_contact(sim, geoms_1, geoms_2=None):
+ """
+ Finds contact between two geom groups.
+ Args:
+ sim (MjSim): Current simulation object
+ geoms_1 (str or list of str or MujocoModel): an individual geom name or list of geom names or a model. If
+ a MujocoModel is specified, the geoms checked will be its contact_geoms
+ geoms_2 (str or list of str or MujocoModel or None): another individual geom name or list of geom names.
+ If a MujocoModel is specified, the geoms checked will be its contact_geoms. If None, will check
+ any collision with @geoms_1 to any other geom in the environment
+ Returns:
+ bool: True if any geom in @geoms_1 is in contact with any geom in @geoms_2.
+ """
+ # Check if either geoms_1 or geoms_2 is a string, convert to list if so
+ if type(geoms_1) is str:
+ geoms_1 = [geoms_1]
+ elif isinstance(geoms_1, MujocoModel):
+ geoms_1 = geoms_1.contact_geoms
+ if type(geoms_2) is str:
+ geoms_2 = [geoms_2]
+ elif isinstance(geoms_2, MujocoModel):
+ geoms_2 = geoms_2.contact_geoms
+ for i in range(sim.data.ncon):
+ contact = sim.data.contact[i]
+ # check contact geom in geoms
+ c1_in_g1 = sim.model.geom_id2name(contact.geom1) in geoms_1
+ c2_in_g2 = sim.model.geom_id2name(contact.geom2) in geoms_2 if geoms_2 is not None else True
+ # check contact geom in geoms (flipped)
+ c2_in_g1 = sim.model.geom_id2name(contact.geom2) in geoms_1
+ c1_in_g2 = sim.model.geom_id2name(contact.geom1) in geoms_2 if geoms_2 is not None else True
+ if (c1_in_g1 and c2_in_g2) or (c1_in_g2 and c2_in_g1):
+ return True
+ return False
+
+
+def get_contacts(sim, model):
+ """
+ Checks for any contacts with @model (as defined by @model's contact_geoms) and returns the set of
+ geom names currently in contact with that model (excluding the geoms that are part of the model itself).
+ Args:
+ sim (MjSim): Current simulation model
+ model (MujocoModel): Model to check contacts for.
+ Returns:
+ set: Unique geoms that are actively in contact with this model.
+ Raises:
+ AssertionError: [Invalid input type]
+ """
+ # Make sure model is MujocoModel type
+ assert isinstance(model, MujocoModel), "Inputted model must be of type MujocoModel; got type {} instead!".format(
+ type(model)
+ )
+ contact_set = set()
+ for contact in sim.data.contact[: sim.data.ncon]:
+ # check contact geom in geoms; add to contact set if match is found
+ g1, g2 = sim.model.geom_id2name(contact.geom1), sim.model.geom_id2name(contact.geom2)
+ if g1 in model.contact_geoms and g2 not in model.contact_geoms:
+ contact_set.add(g2)
+ elif g2 in model.contact_geoms and g1 not in model.contact_geoms:
+ contact_set.add(g1)
+ return contact_set
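+
+
+# Example usage (illustrative sketch; assumes an existing MjSim named `sim`, hypothetical geom
+# names "gripper0_finger1_collision" / "cube_g0" that exist in the model, and a MujocoModel `cube`):
+#
+#     touching = check_contact(sim, "gripper0_finger1_collision", "cube_g0")
+#     touching_anything = check_contact(sim, cube)     # geoms_2=None checks against all other geoms
+#     geoms_in_contact = get_contacts(sim, cube)       # set of geom names currently touching `cube`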
diff --git a/phantom/submodules/phantom-robosuite/robosuite/utils/transform_utils.py b/phantom/submodules/phantom-robosuite/robosuite/utils/transform_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac371c09da962dcceea8a376876fc1d46c0e52da
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/utils/transform_utils.py
@@ -0,0 +1,929 @@
+"""
+Utility functions of matrix and vector transformations.
+
+NOTE: convention for quaternions is (x, y, z, w)
+"""
+
+import math
+
+import numpy as np
+
+from robosuite.utils.numba import jit_decorator
+
+PI = np.pi
+EPS = np.finfo(float).eps * 4.0
+
+# axis sequences for Euler angles
+_NEXT_AXIS = [1, 2, 0, 1]
+
+# map axes strings to/from tuples of inner axis, parity, repetition, frame
+_AXES2TUPLE = {
+ "sxyz": (0, 0, 0, 0),
+ "sxyx": (0, 0, 1, 0),
+ "sxzy": (0, 1, 0, 0),
+ "sxzx": (0, 1, 1, 0),
+ "syzx": (1, 0, 0, 0),
+ "syzy": (1, 0, 1, 0),
+ "syxz": (1, 1, 0, 0),
+ "syxy": (1, 1, 1, 0),
+ "szxy": (2, 0, 0, 0),
+ "szxz": (2, 0, 1, 0),
+ "szyx": (2, 1, 0, 0),
+ "szyz": (2, 1, 1, 0),
+ "rzyx": (0, 0, 0, 1),
+ "rxyx": (0, 0, 1, 1),
+ "ryzx": (0, 1, 0, 1),
+ "rxzx": (0, 1, 1, 1),
+ "rxzy": (1, 0, 0, 1),
+ "ryzy": (1, 0, 1, 1),
+ "rzxy": (1, 1, 0, 1),
+ "ryxy": (1, 1, 1, 1),
+ "ryxz": (2, 0, 0, 1),
+ "rzxz": (2, 0, 1, 1),
+ "rxyz": (2, 1, 0, 1),
+ "rzyz": (2, 1, 1, 1),
+}
+
+_TUPLE2AXES = dict((v, k) for k, v in _AXES2TUPLE.items())
+
+
+def convert_quat(q, to="xyzw"):
+ """
+ Converts quaternion from one convention to another.
+ The convention to convert TO is specified as an optional argument.
+ If to == 'xyzw', then the input is in 'wxyz' format, and vice-versa.
+
+ Args:
+ q (np.array): a 4-dim array corresponding to a quaternion
+ to (str): either 'xyzw' or 'wxyz', determining which convention to convert to.
+ """
+ if to == "xyzw":
+ return q[[1, 2, 3, 0]]
+ if to == "wxyz":
+ return q[[3, 0, 1, 2]]
+ raise Exception("convert_quat: choose a valid `to` argument (xyzw or wxyz)")
+
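+# Example (illustrative sketch):
+#
+#     >>> convert_quat(np.array([1, 0, 0, 0]), to="xyzw")   # (w,x,y,z) identity -> (x,y,z,w)
+#     array([0, 0, 0, 1])
+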
+
+def quat_multiply(quaternion1, quaternion0):
+ """
+ Return multiplication of two quaternions (q1 * q0).
+
+ E.g.:
+ >>> q = quat_multiply([1, -2, 3, 4], [-5, 6, 7, 8])
+ >>> np.allclose(q, [-44, -14, 48, 28])
+ True
+
+ Args:
+ quaternion1 (np.array): (x,y,z,w) quaternion
+ quaternion0 (np.array): (x,y,z,w) quaternion
+
+ Returns:
+ np.array: (x,y,z,w) multiplied quaternion
+ """
+ x0, y0, z0, w0 = quaternion0
+ x1, y1, z1, w1 = quaternion1
+ return np.array(
+ (
+ x1 * w0 + y1 * z0 - z1 * y0 + w1 * x0,
+ -x1 * z0 + y1 * w0 + z1 * x0 + w1 * y0,
+ x1 * y0 - y1 * x0 + z1 * w0 + w1 * z0,
+ -x1 * x0 - y1 * y0 - z1 * z0 + w1 * w0,
+ ),
+ dtype=np.float32,
+ )
+
+
+def quat_conjugate(quaternion):
+ """
+ Return conjugate of quaternion.
+
+ E.g.:
+ >>> q0 = random_quaternion()
+ >>> q1 = quat_conjugate(q0)
+ >>> q1[3] == q0[3] and all(q1[:3] == -q0[:3])
+ True
+
+ Args:
+ quaternion (np.array): (x,y,z,w) quaternion
+
+ Returns:
+ np.array: (x,y,z,w) quaternion conjugate
+ """
+ return np.array(
+ (-quaternion[0], -quaternion[1], -quaternion[2], quaternion[3]),
+ dtype=np.float32,
+ )
+
+
+def quat_inverse(quaternion):
+ """
+ Return inverse of quaternion.
+
+ E.g.:
+ >>> q0 = random_quaternion()
+ >>> q1 = quat_inverse(q0)
+ >>> np.allclose(quat_multiply(q0, q1), [0, 0, 0, 1])
+ True
+
+ Args:
+ quaternion (np.array): (x,y,z,w) quaternion
+
+ Returns:
+ np.array: (x,y,z,w) quaternion inverse
+ """
+ return quat_conjugate(quaternion) / np.dot(quaternion, quaternion)
+
+
+def quat_distance(quaternion1, quaternion0):
+ """
+ Returns distance between two quaternions, such that distance * quaternion0 = quaternion1
+
+ Args:
+ quaternion1 (np.array): (x,y,z,w) quaternion
+ quaternion0 (np.array): (x,y,z,w) quaternion
+
+ Returns:
+ np.array: (x,y,z,w) quaternion distance
+ """
+ return quat_multiply(quaternion1, quat_inverse(quaternion0))
+
+
+def quat_slerp(quat0, quat1, fraction, shortestpath=True):
+ """
+ Return spherical linear interpolation between two quaternions.
+
+ E.g.:
+ >>> q0 = random_quat()
+ >>> q1 = random_quat()
+ >>> q = quat_slerp(q0, q1, 0.0)
+ >>> np.allclose(q, q0)
+ True
+
+ >>> q = quat_slerp(q0, q1, 1.0)
+ >>> np.allclose(q, q1)
+ True
+
+ >>> q = quat_slerp(q0, q1, 0.5)
+ >>> angle = math.acos(np.dot(q0, q))
+ >>> np.allclose(2.0, math.acos(np.dot(q0, q1)) / angle) or \
+ np.allclose(2.0, math.acos(-np.dot(q0, q1)) / angle)
+ True
+
+ Args:
+ quat0 (np.array): (x,y,z,w) quaternion startpoint
+ quat1 (np.array): (x,y,z,w) quaternion endpoint
+ fraction (float): fraction of interpolation to calculate
+ shortestpath (bool): If True, will calculate the shortest path
+
+ Returns:
+        np.array: (x,y,z,w) interpolated quaternion
+ """
+ q0 = unit_vector(quat0[:4])
+ q1 = unit_vector(quat1[:4])
+ if fraction == 0.0:
+ return q0
+ elif fraction == 1.0:
+ return q1
+ d = np.dot(q0, q1)
+ if abs(abs(d) - 1.0) < EPS:
+ return q0
+ if shortestpath and d < 0.0:
+ # invert rotation
+ d = -d
+ q1 *= -1.0
+ angle = math.acos(np.clip(d, -1, 1))
+ if abs(angle) < EPS:
+ return q0
+ isin = 1.0 / math.sin(angle)
+ q0 *= math.sin((1.0 - fraction) * angle) * isin
+ q1 *= math.sin(fraction * angle) * isin
+ q0 += q1
+ return q0
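For example, interpolating halfway between the identity and a 90-degree z-rotation should give a 45-degree z-rotation (illustrative sketch, assuming the functions above are in scope):

import numpy as np

q_id = np.array([0.0, 0.0, 0.0, 1.0])                               # identity
q_z90 = np.array([0.0, 0.0, np.sin(np.pi / 4), np.cos(np.pi / 4)])  # 90 deg about z

q_half = quat_slerp(q_id, q_z90, 0.5)
expected = np.array([0.0, 0.0, np.sin(np.pi / 8), np.cos(np.pi / 8)])  # 45 deg about z
assert np.allclose(q_half, expected, atol=1e-6)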
+
+
+def random_quat(rand=None):
+ """
+ Return uniform random unit quaternion.
+
+ E.g.:
+ >>> q = random_quat()
+    >>> np.allclose(1.0, np.linalg.norm(q))
+ True
+ >>> q = random_quat(np.random.random(3))
+ >>> q.shape
+ (4,)
+
+ Args:
+ rand (3-array or None): If specified, must be three independent random variables that are uniformly distributed
+ between 0 and 1.
+
+ Returns:
+ np.array: (x,y,z,w) random quaternion
+ """
+ if rand is None:
+ rand = np.random.rand(3)
+ else:
+ assert len(rand) == 3
+ r1 = np.sqrt(1.0 - rand[0])
+ r2 = np.sqrt(rand[0])
+ pi2 = math.pi * 2.0
+ t1 = pi2 * rand[1]
+ t2 = pi2 * rand[2]
+ return np.array(
+ (np.sin(t1) * r1, np.cos(t1) * r1, np.sin(t2) * r2, np.cos(t2) * r2),
+ dtype=np.float32,
+ )
+
+
+def random_axis_angle(angle_limit=None, random_state=None):
+ """
+ Samples an axis-angle rotation by first sampling a random axis
+ and then sampling an angle. If @angle_limit is provided, the size
+ of the rotation angle is constrained.
+
+ If @random_state is provided (instance of np.random.RandomState), it
+ will be used to generate random numbers.
+
+ Args:
+ angle_limit (None or float): If set, determines magnitude limit of angles to generate
+ random_state (None or RandomState): RNG to use if specified
+
+    Returns:
+        2-tuple:
+
+        - (np.array) (ax, ay, az) sampled unit axis of rotation
+        - (float) sampled rotation angle in radians
+
+    Raises:
+        AssertionError: [Invalid RNG]
+    """
+ if angle_limit is None:
+ angle_limit = 2.0 * np.pi
+
+ if random_state is not None:
+ assert isinstance(random_state, np.random.RandomState)
+ npr = random_state
+ else:
+ npr = np.random
+
+ # sample random axis using a normalized sample from spherical Gaussian.
+ # see (http://extremelearning.com.au/how-to-generate-uniformly-random-points-on-n-spheres-and-n-balls/)
+ # for why it works.
+ random_axis = npr.randn(3)
+ random_axis /= np.linalg.norm(random_axis)
+ random_angle = npr.uniform(low=0.0, high=angle_limit)
+ return random_axis, random_angle
+
+
+def vec(values):
+ """
+ Converts value tuple into a numpy vector.
+
+ Args:
+ values (n-array): a tuple of numbers
+
+ Returns:
+ np.array: vector of given values
+ """
+ return np.array(values, dtype=np.float32)
+
+
+def mat4(array):
+ """
+ Converts an array to 4x4 matrix.
+
+ Args:
+ array (n-array): the array in form of vec, list, or tuple
+
+ Returns:
+ np.array: a 4x4 numpy matrix
+ """
+ return np.array(array, dtype=np.float32).reshape((4, 4))
+
+
+def mat2pose(hmat):
+ """
+ Converts a homogeneous 4x4 matrix into pose.
+
+ Args:
+ hmat (np.array): a 4x4 homogeneous matrix
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) (x,y,z) position array in cartesian coordinates
+ - (np.array) (x,y,z,w) orientation array in quaternion form
+ """
+ pos = hmat[:3, 3]
+ orn = mat2quat(hmat[:3, :3])
+ return pos, orn
+
+
+@jit_decorator
+def mat2quat(rmat):
+ """
+ Converts given rotation matrix to quaternion.
+
+ Args:
+ rmat (np.array): 3x3 rotation matrix
+
+ Returns:
+ np.array: (x,y,z,w) float quaternion angles
+ """
+ M = np.asarray(rmat).astype(np.float32)[:3, :3]
+
+ m00 = M[0, 0]
+ m01 = M[0, 1]
+ m02 = M[0, 2]
+ m10 = M[1, 0]
+ m11 = M[1, 1]
+ m12 = M[1, 2]
+ m20 = M[2, 0]
+ m21 = M[2, 1]
+ m22 = M[2, 2]
+ # symmetric matrix K
+ K = np.array(
+ [
+ [m00 - m11 - m22, np.float32(0.0), np.float32(0.0), np.float32(0.0)],
+ [m01 + m10, m11 - m00 - m22, np.float32(0.0), np.float32(0.0)],
+ [m02 + m20, m12 + m21, m22 - m00 - m11, np.float32(0.0)],
+ [m21 - m12, m02 - m20, m10 - m01, m00 + m11 + m22],
+ ]
+ )
+ K /= 3.0
+ # quaternion is Eigen vector of K that corresponds to largest eigenvalue
+ w, V = np.linalg.eigh(K)
+ inds = np.array([3, 0, 1, 2])
+ q1 = V[inds, np.argmax(w)]
+ if q1[0] < 0.0:
+ np.negative(q1, q1)
+ inds = np.array([1, 2, 3, 0])
+ return q1[inds]
+
+
+def euler2mat(euler):
+ """
+ Converts euler angles into rotation matrix form
+
+ Args:
+ euler (np.array): (r,p,y) angles
+
+ Returns:
+ np.array: 3x3 rotation matrix
+
+ Raises:
+ AssertionError: [Invalid input shape]
+ """
+
+ euler = np.asarray(euler, dtype=np.float64)
+ assert euler.shape[-1] == 3, "Invalid shaped euler {}".format(euler)
+
+ ai, aj, ak = -euler[..., 2], -euler[..., 1], -euler[..., 0]
+ si, sj, sk = np.sin(ai), np.sin(aj), np.sin(ak)
+ ci, cj, ck = np.cos(ai), np.cos(aj), np.cos(ak)
+ cc, cs = ci * ck, ci * sk
+ sc, ss = si * ck, si * sk
+
+ mat = np.empty(euler.shape[:-1] + (3, 3), dtype=np.float64)
+ mat[..., 2, 2] = cj * ck
+ mat[..., 2, 1] = sj * sc - cs
+ mat[..., 2, 0] = sj * cc + ss
+ mat[..., 1, 2] = cj * sk
+ mat[..., 1, 1] = sj * ss + cc
+ mat[..., 1, 0] = sj * cs - sc
+ mat[..., 0, 2] = -sj
+ mat[..., 0, 1] = cj * si
+ mat[..., 0, 0] = cj * ci
+ return mat
+
+
+def mat2euler(rmat, axes="sxyz"):
+ """
+ Converts given rotation matrix to euler angles in radian.
+
+ Args:
+ rmat (np.array): 3x3 rotation matrix
+ axes (str): One of 24 axis sequences as string or encoded tuple (see top of this module)
+
+ Returns:
+ np.array: (r,p,y) converted euler angles in radian vec3 float
+ """
+ try:
+ firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()]
+ except (AttributeError, KeyError):
+ firstaxis, parity, repetition, frame = axes
+
+ i = firstaxis
+ j = _NEXT_AXIS[i + parity]
+ k = _NEXT_AXIS[i - parity + 1]
+
+ M = np.array(rmat, dtype=np.float32, copy=False)[:3, :3]
+ if repetition:
+ sy = math.sqrt(M[i, j] * M[i, j] + M[i, k] * M[i, k])
+ if sy > EPS:
+ ax = math.atan2(M[i, j], M[i, k])
+ ay = math.atan2(sy, M[i, i])
+ az = math.atan2(M[j, i], -M[k, i])
+ else:
+ ax = math.atan2(-M[j, k], M[j, j])
+ ay = math.atan2(sy, M[i, i])
+ az = 0.0
+ else:
+ cy = math.sqrt(M[i, i] * M[i, i] + M[j, i] * M[j, i])
+ if cy > EPS:
+ ax = math.atan2(M[k, j], M[k, k])
+ ay = math.atan2(-M[k, i], cy)
+ az = math.atan2(M[j, i], M[i, i])
+ else:
+ ax = math.atan2(-M[j, k], M[j, j])
+ ay = math.atan2(-M[k, i], cy)
+ az = 0.0
+
+ if parity:
+ ax, ay, az = -ax, -ay, -az
+ if frame:
+ ax, az = az, ax
+ return vec((ax, ay, az))
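euler2mat and mat2euler with the default "sxyz" axes invert each other away from singular pitch values; a minimal round-trip sketch (angles are arbitrary, functions assumed to be in scope):

import numpy as np

rpy = np.array([0.1, -0.2, 0.3])                    # roll, pitch, yaw in radians
R = euler2mat(rpy)
assert np.allclose(R @ R.T, np.eye(3), atol=1e-6)   # valid rotation matrix
assert np.allclose(mat2euler(R), rpy, atol=1e-5)    # default "sxyz" convention recovers the angles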
+
+
+def pose2mat(pose):
+ """
+ Converts pose to homogeneous matrix.
+
+ Args:
+ pose (2-tuple): a (pos, orn) tuple where pos is vec3 float cartesian, and
+ orn is vec4 float quaternion.
+
+ Returns:
+ np.array: 4x4 homogeneous matrix
+ """
+ homo_pose_mat = np.zeros((4, 4), dtype=np.float32)
+ homo_pose_mat[:3, :3] = quat2mat(pose[1])
+ homo_pose_mat[:3, 3] = np.array(pose[0], dtype=np.float32)
+ homo_pose_mat[3, 3] = 1.0
+ return homo_pose_mat
+
+
+@jit_decorator
+def quat2mat(quaternion):
+ """
+ Converts given quaternion to matrix.
+
+ Args:
+ quaternion (np.array): (x,y,z,w) vec4 float angles
+
+ Returns:
+ np.array: 3x3 rotation matrix
+ """
+ # awkward semantics for use with numba
+ inds = np.array([3, 0, 1, 2])
+ q = np.asarray(quaternion).copy().astype(np.float32)[inds]
+
+ n = np.dot(q, q)
+ if n < EPS:
+ return np.identity(3)
+ q *= math.sqrt(2.0 / n)
+ q2 = np.outer(q, q)
+ return np.array(
+ [
+ [1.0 - q2[2, 2] - q2[3, 3], q2[1, 2] - q2[3, 0], q2[1, 3] + q2[2, 0]],
+ [q2[1, 2] + q2[3, 0], 1.0 - q2[1, 1] - q2[3, 3], q2[2, 3] - q2[1, 0]],
+ [q2[1, 3] - q2[2, 0], q2[2, 3] + q2[1, 0], 1.0 - q2[1, 1] - q2[2, 2]],
+ ]
+ )
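mat2quat and quat2mat round-trip each other up to the usual quaternion sign ambiguity; for instance (sketch, assuming the functions above are in scope):

import numpy as np

q = np.array([0.0, 0.0, np.sin(np.pi / 6), np.cos(np.pi / 6)])  # 60 deg about z, (x,y,z,w)
R = quat2mat(q)
assert np.allclose(np.linalg.det(R), 1.0, atol=1e-5)            # proper rotation

q_back = mat2quat(R)
# mat2quat resolves the sign ambiguity internally, so compare up to sign
assert np.allclose(q_back, q, atol=1e-5) or np.allclose(q_back, -q, atol=1e-5)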
+
+
+def quat2axisangle(quat):
+ """
+ Converts quaternion to axis-angle format.
+ Returns a unit vector direction scaled by its angle in radians.
+
+ Args:
+ quat (np.array): (x,y,z,w) vec4 float angles
+
+ Returns:
+ np.array: (ax,ay,az) axis-angle exponential coordinates
+ """
+ # clip quaternion
+ if quat[3] > 1.0:
+ quat[3] = 1.0
+ elif quat[3] < -1.0:
+ quat[3] = -1.0
+
+ den = np.sqrt(1.0 - quat[3] * quat[3])
+ if math.isclose(den, 0.0):
+ # This is (close to) a zero degree rotation, immediately return
+ return np.zeros(3)
+
+ return (quat[:3] * 2.0 * math.acos(quat[3])) / den
+
+
+def axisangle2quat(vec):
+ """
+ Converts scaled axis-angle to quat.
+
+ Args:
+ vec (np.array): (ax,ay,az) axis-angle exponential coordinates
+
+ Returns:
+ np.array: (x,y,z,w) vec4 float angles
+ """
+ # Grab angle
+ angle = np.linalg.norm(vec)
+
+ # handle zero-rotation case
+ if math.isclose(angle, 0.0):
+ return np.array([0.0, 0.0, 0.0, 1.0])
+
+ # make sure that axis is a unit vector
+ axis = vec / angle
+
+ q = np.zeros(4)
+ q[3] = np.cos(angle / 2.0)
+ q[:3] = axis * np.sin(angle / 2.0)
+ return q
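The two axis-angle helpers invert each other for non-zero rotations, e.g. (illustrative values):

import numpy as np

aa = np.array([0.0, 0.0, np.pi / 2])   # 90 deg about z in exponential coordinates
q = axisangle2quat(aa)
assert np.allclose(q, [0.0, 0.0, np.sin(np.pi / 4), np.cos(np.pi / 4)], atol=1e-6)
assert np.allclose(quat2axisangle(q), aa, atol=1e-6)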
+
+
+def pose_in_A_to_pose_in_B(pose_A, pose_A_in_B):
+ """
+    Converts a homogeneous matrix corresponding to a point C in frame A
+    to a homogeneous matrix corresponding to the same point C in frame B.
+
+ Args:
+ pose_A (np.array): 4x4 matrix corresponding to the pose of C in frame A
+ pose_A_in_B (np.array): 4x4 matrix corresponding to the pose of A in frame B
+
+ Returns:
+ np.array: 4x4 matrix corresponding to the pose of C in frame B
+ """
+
+    # pose of A in B takes a point in A and transforms it to a point in B.
+
+    # pose of C in B = pose of A in B * pose of C in A
+    # i.e. take a point in C, express it in A, then express it in B
+    # T_C^B = T_A^B * T_C^A
+ return pose_A_in_B.dot(pose_A)
+
+
+def pose_inv(pose):
+ """
+ Computes the inverse of a homogeneous matrix corresponding to the pose of some
+ frame B in frame A. The inverse is the pose of frame A in frame B.
+
+ Args:
+ pose (np.array): 4x4 matrix for the pose to inverse
+
+ Returns:
+ np.array: 4x4 matrix for the inverse pose
+ """
+
+    # Note: the inverse of a pose matrix is
+    # [R t; 0 1]^-1 = [R.T -R.T*t; 0 1]
+
+    # Intuitively, this makes sense.
+    # The original pose maps a point x to R*x + t (rotate by R, then translate by t).
+    # To undo it, first subtract the translation and then apply the inverse rotation
+    # R^-1 = R.T, which gives x -> R.T*(x - t) = R.T*x - R.T*t.
+
+ pose_inv = np.zeros((4, 4))
+ pose_inv[:3, :3] = pose[:3, :3].T
+ pose_inv[:3, 3] = -pose_inv[:3, :3].dot(pose[:3, 3])
+ pose_inv[3, 3] = 1.0
+ return pose_inv
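A small sketch tying the pose helpers together (make_pose, mat2pose and pose2mat are defined elsewhere in this file; the gripper pose below is a made-up example):

import numpy as np

# hypothetical pose of a gripper frame expressed in the world frame
T_gripper_in_world = make_pose(np.array([0.5, 0.1, 0.3]),
                               euler2mat(np.array([0.0, 0.0, np.pi / 2])))

# composing a pose with its inverse recovers the identity transform
T_identity = pose_in_A_to_pose_in_B(T_gripper_in_world, pose_inv(T_gripper_in_world))
assert np.allclose(T_identity, np.eye(4), atol=1e-6)

# mat2pose / pose2mat round-trip the same transform
pos, quat = mat2pose(T_gripper_in_world)
assert np.allclose(pose2mat((pos, quat)), T_gripper_in_world, atol=1e-5)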
+
+
+def _skew_symmetric_translation(pos_A_in_B):
+ """
+ Helper function to get a skew symmetric translation matrix for converting quantities
+ between frames.
+
+ Args:
+ pos_A_in_B (np.array): (x,y,z) position of A in frame B
+
+ Returns:
+ np.array: 3x3 skew symmetric translation matrix
+ """
+ return np.array(
+ [
+ 0.0,
+ -pos_A_in_B[2],
+ pos_A_in_B[1],
+ pos_A_in_B[2],
+ 0.0,
+ -pos_A_in_B[0],
+ -pos_A_in_B[1],
+ pos_A_in_B[0],
+ 0.0,
+ ]
+ ).reshape((3, 3))
+
+
+def vel_in_A_to_vel_in_B(vel_A, ang_vel_A, pose_A_in_B):
+ """
+ Converts linear and angular velocity of a point in frame A to the equivalent in frame B.
+
+ Args:
+ vel_A (np.array): (vx,vy,vz) linear velocity in A
+ ang_vel_A (np.array): (wx,wy,wz) angular velocity in A
+ pose_A_in_B (np.array): 4x4 matrix corresponding to the pose of A in frame B
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) (vx,vy,vz) linear velocities in frame B
+ - (np.array) (wx,wy,wz) angular velocities in frame B
+ """
+ pos_A_in_B = pose_A_in_B[:3, 3]
+ rot_A_in_B = pose_A_in_B[:3, :3]
+ skew_symm = _skew_symmetric_translation(pos_A_in_B)
+ vel_B = rot_A_in_B.dot(vel_A) + skew_symm.dot(rot_A_in_B.dot(ang_vel_A))
+ ang_vel_B = rot_A_in_B.dot(ang_vel_A)
+ return vel_B, ang_vel_B
+
+
+def force_in_A_to_force_in_B(force_A, torque_A, pose_A_in_B):
+ """
+ Converts linear and rotational force at a point in frame A to the equivalent in frame B.
+
+ Args:
+ force_A (np.array): (fx,fy,fz) linear force in A
+ torque_A (np.array): (tx,ty,tz) rotational force (moment) in A
+ pose_A_in_B (np.array): 4x4 matrix corresponding to the pose of A in frame B
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) (fx,fy,fz) linear forces in frame B
+ - (np.array) (tx,ty,tz) moments in frame B
+ """
+ pos_A_in_B = pose_A_in_B[:3, 3]
+ rot_A_in_B = pose_A_in_B[:3, :3]
+ skew_symm = _skew_symmetric_translation(pos_A_in_B)
+ force_B = rot_A_in_B.T.dot(force_A)
+ torque_B = -rot_A_in_B.T.dot(skew_symm.dot(force_A)) + rot_A_in_B.T.dot(torque_A)
+ return force_B, torque_B
+
+
+def rotation_matrix(angle, direction, point=None):
+ """
+ Returns matrix to rotate about axis defined by point and direction.
+
+ E.g.:
+ >>> angle = (random.random() - 0.5) * (2*math.pi)
+ >>> direc = numpy.random.random(3) - 0.5
+ >>> point = numpy.random.random(3) - 0.5
+ >>> R0 = rotation_matrix(angle, direc, point)
+ >>> R1 = rotation_matrix(angle-2*math.pi, direc, point)
+ >>> is_same_transform(R0, R1)
+ True
+
+ >>> R0 = rotation_matrix(angle, direc, point)
+ >>> R1 = rotation_matrix(-angle, -direc, point)
+ >>> is_same_transform(R0, R1)
+ True
+
+ >>> I = numpy.identity(4, numpy.float32)
+ >>> numpy.allclose(I, rotation_matrix(math.pi*2, direc))
+ True
+
+ >>> numpy.allclose(2., numpy.trace(rotation_matrix(math.pi/2,
+ ... direc, point)))
+ True
+
+ Args:
+ angle (float): Magnitude of rotation
+ direction (np.array): (ax,ay,az) axis about which to rotate
+ point (None or np.array): If specified, is the (x,y,z) point about which the rotation will occur
+
+ Returns:
+ np.array: 4x4 homogeneous matrix that includes the desired rotation
+ """
+ sina = math.sin(angle)
+ cosa = math.cos(angle)
+ direction = unit_vector(direction[:3])
+ # rotation matrix around unit vector
+ R = np.array(((cosa, 0.0, 0.0), (0.0, cosa, 0.0), (0.0, 0.0, cosa)), dtype=np.float32)
+ R += np.outer(direction, direction) * (1.0 - cosa)
+ direction *= sina
+ R += np.array(
+ (
+ (0.0, -direction[2], direction[1]),
+ (direction[2], 0.0, -direction[0]),
+ (-direction[1], direction[0], 0.0),
+ ),
+ dtype=np.float32,
+ )
+ M = np.identity(4)
+ M[:3, :3] = R
+ if point is not None:
+ # rotation not around origin
+ point = np.array(point[:3], dtype=np.float32, copy=False)
+ M[:3, 3] = point - np.dot(R, point)
+ return M
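For instance, rotating by 90 degrees about the z-axis through the point (1, 0, 0) leaves the pivot fixed and swings the origin around it (sketch; values chosen for illustration):

import math
import numpy as np

M = rotation_matrix(math.pi / 2, np.array([0.0, 0.0, 1.0]), point=np.array([1.0, 0.0, 0.0]))

# the pivot point is unchanged by the rotation
assert np.allclose(M.dot([1.0, 0.0, 0.0, 1.0]), [1.0, 0.0, 0.0, 1.0], atol=1e-6)

# the origin is carried to (1, -1, 0) by the 90 deg turn around the pivot
assert np.allclose(M.dot([0.0, 0.0, 0.0, 1.0]), [1.0, -1.0, 0.0, 1.0], atol=1e-6)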
+
+
+def clip_translation(dpos, limit):
+ """
+ Limits a translation (delta position) to a specified limit
+
+ Scales down the norm of the dpos to 'limit' if norm(dpos) > limit, else returns immediately
+
+ Args:
+ dpos (n-array): n-dim Translation being clipped (e,g.: (x, y, z)) -- numpy array
+ limit (float): Value to limit translation by -- magnitude (scalar, in same units as input)
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) Clipped translation (same dimension as inputs)
+ - (bool) whether the value was clipped or not
+ """
+ input_norm = np.linalg.norm(dpos)
+ return (dpos * limit / input_norm, True) if input_norm > limit else (dpos, False)
+
+
+def clip_rotation(quat, limit):
+ """
+ Limits a (delta) rotation to a specified limit
+
+ Converts rotation to axis-angle, clips, then re-converts back into quaternion
+
+ Args:
+ quat (np.array): (x,y,z,w) rotation being clipped
+ limit (float): Value to limit rotation by -- magnitude (scalar, in radians)
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) Clipped rotation quaternion (x, y, z, w)
+ - (bool) whether the value was clipped or not
+ """
+ clipped = False
+
+ # First, normalize the quaternion
+ quat = quat / np.linalg.norm(quat)
+
+ den = np.sqrt(max(1 - quat[3] * quat[3], 0))
+ if den == 0:
+ # This is a zero degree rotation, immediately return
+ return quat, clipped
+ else:
+ # This is all other cases
+ x = quat[0] / den
+ y = quat[1] / den
+ z = quat[2] / den
+ a = 2 * math.acos(quat[3])
+
+ # Clip rotation if necessary and return clipped quat
+ if abs(a) > limit:
+ a = limit * np.sign(a) / 2
+ sa = math.sin(a)
+ ca = math.cos(a)
+ quat = np.array([x * sa, y * sa, z * sa, ca])
+ clipped = True
+
+ return quat, clipped
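Both clipping helpers report whether clipping happened, which controllers typically use for logging or safety checks; a brief sketch with made-up limits, assuming the functions above are in scope:

import numpy as np

# a translation of norm ~0.173 m clipped down to 0.05 m
dpos, clipped = clip_translation(np.array([0.1, 0.1, 0.1]), 0.05)
assert clipped and np.isclose(np.linalg.norm(dpos), 0.05)

# a 90 deg rotation about z clipped to a 30 deg magnitude
q90 = axisangle2quat(np.array([0.0, 0.0, np.pi / 2]))
q_clipped, was_clipped = clip_rotation(q90, np.pi / 6)
assert was_clipped
assert np.isclose(2.0 * np.arccos(q_clipped[3]), np.pi / 6, atol=1e-6)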
+
+
+def make_pose(translation, rotation):
+ """
+ Makes a homogeneous pose matrix from a translation vector and a rotation matrix.
+
+ Args:
+ translation (np.array): (x,y,z) translation value
+ rotation (np.array): a 3x3 matrix representing rotation
+
+ Returns:
+ pose (np.array): a 4x4 homogeneous matrix
+ """
+ pose = np.zeros((4, 4))
+ pose[:3, :3] = rotation
+ pose[:3, 3] = translation
+ pose[3, 3] = 1.0
+ return pose
+
+
+def unit_vector(data, axis=None, out=None):
+ """
+    Returns ndarray normalized by length, i.e. Euclidean norm, along axis.
+
+ E.g.:
+ >>> v0 = numpy.random.random(3)
+ >>> v1 = unit_vector(v0)
+ >>> numpy.allclose(v1, v0 / numpy.linalg.norm(v0))
+ True
+
+ >>> v0 = numpy.random.rand(5, 4, 3)
+ >>> v1 = unit_vector(v0, axis=-1)
+ >>> v2 = v0 / numpy.expand_dims(numpy.sqrt(numpy.sum(v0*v0, axis=2)), 2)
+ >>> numpy.allclose(v1, v2)
+ True
+
+ >>> v1 = unit_vector(v0, axis=1)
+ >>> v2 = v0 / numpy.expand_dims(numpy.sqrt(numpy.sum(v0*v0, axis=1)), 1)
+ >>> numpy.allclose(v1, v2)
+ True
+
+ >>> v1 = numpy.empty((5, 4, 3), dtype=numpy.float32)
+ >>> unit_vector(v0, axis=1, out=v1)
+ >>> numpy.allclose(v1, v2)
+ True
+
+ >>> list(unit_vector([]))
+ []
+
+ >>> list(unit_vector([1.0]))
+ [1.0]
+
+ Args:
+ data (np.array): data to normalize
+ axis (None or int): If specified, determines specific axis along data to normalize
+ out (None or np.array): If specified, will store computation in this variable
+
+ Returns:
+ None or np.array: If @out is not specified, will return normalized vector. Otherwise, stores the output in @out
+ """
+ if out is None:
+ data = np.array(data, dtype=np.float32, copy=True)
+ if data.ndim == 1:
+ data /= math.sqrt(np.dot(data, data))
+ return data
+ else:
+ if out is not data:
+ out[:] = np.array(data, copy=False)
+ data = out
+ length = np.atleast_1d(np.sum(data * data, axis))
+ np.sqrt(length, length)
+ if axis is not None:
+ length = np.expand_dims(length, axis)
+ data /= length
+ if out is None:
+ return data
+
+
+def get_orientation_error(target_orn, current_orn):
+ """
+ Returns the difference between two quaternion orientations as a 3 DOF numpy array.
+ For use in an impedance controller / task-space PD controller.
+
+ Args:
+ target_orn (np.array): (x, y, z, w) desired quaternion orientation
+ current_orn (np.array): (x, y, z, w) current quaternion orientation
+
+ Returns:
+ orn_error (np.array): (ax,ay,az) current orientation error, corresponds to
+ (target_orn - current_orn)
+ """
+ current_orn = np.array([current_orn[3], current_orn[0], current_orn[1], current_orn[2]])
+ target_orn = np.array([target_orn[3], target_orn[0], target_orn[1], target_orn[2]])
+
+ pinv = np.zeros((3, 4))
+ pinv[0, :] = [-current_orn[1], current_orn[0], -current_orn[3], current_orn[2]]
+ pinv[1, :] = [-current_orn[2], current_orn[3], current_orn[0], -current_orn[1]]
+ pinv[2, :] = [-current_orn[3], -current_orn[2], current_orn[1], current_orn[0]]
+ orn_error = 2.0 * pinv.dot(np.array(target_orn))
+ return orn_error
+
+
+def get_pose_error(target_pose, current_pose):
+ """
+ Computes the error corresponding to target pose - current pose as a 6-dim vector.
+ The first 3 components correspond to translational error while the last 3 components
+ correspond to the rotational error.
+
+ Args:
+        target_pose (np.array): a 4x4 homogeneous matrix for the target pose
+        current_pose (np.array): a 4x4 homogeneous matrix for the current pose
+
+ Returns:
+ np.array: 6-dim pose error.
+ """
+ error = np.zeros(6)
+
+ # compute translational error
+ target_pos = target_pose[:3, 3]
+ current_pos = current_pose[:3, 3]
+ pos_err = target_pos - current_pos
+
+ # compute rotational error
+ r1 = current_pose[:3, 0]
+ r2 = current_pose[:3, 1]
+ r3 = current_pose[:3, 2]
+ r1d = target_pose[:3, 0]
+ r2d = target_pose[:3, 1]
+ r3d = target_pose[:3, 2]
+ rot_err = 0.5 * (np.cross(r1, r1d) + np.cross(r2, r2d) + np.cross(r3, r3d))
+
+ error[:3] = pos_err
+ error[3:] = rot_err
+ return error
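These two error functions are what a task-space proportional controller would feed its gains; a minimal sketch (the poses and gain below are made up, and make_pose / euler2mat from this file are assumed to be in scope):

import numpy as np

Kp = 2.0  # illustrative proportional gain
current = make_pose(np.array([0.4, 0.0, 0.2]), np.eye(3))
target = make_pose(np.array([0.5, 0.0, 0.3]), euler2mat(np.array([0.0, 0.0, 0.2])))

twist = Kp * get_pose_error(target, current)   # 6-dim (linear, angular) correction
# translational part points toward the target, rotational part is ~0.4 rad about z
assert np.allclose(twist[:3], [0.2, 0.0, 0.2], atol=1e-5)
assert np.allclose(twist[3:], [0.0, 0.0, 2.0 * np.sin(0.2)], atol=1e-5)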
+
+
+@jit_decorator
+def matrix_inverse(matrix):
+ """
+ Helper function to have an efficient matrix inversion function.
+
+ Args:
+ matrix (np.array): 2d-array representing a matrix
+
+ Returns:
+ np.array: 2d-array representing the matrix inverse
+ """
+ return np.linalg.inv(matrix)
diff --git a/phantom/submodules/phantom-robosuite/robosuite/wrappers/__init__.py b/phantom/submodules/phantom-robosuite/robosuite/wrappers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..282a66a0702dda49c0a4d2822bb5c994d85d8cd4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/wrappers/__init__.py
@@ -0,0 +1,10 @@
+from robosuite.wrappers.wrapper import Wrapper
+from robosuite.wrappers.data_collection_wrapper import DataCollectionWrapper
+from robosuite.wrappers.demo_sampler_wrapper import DemoSamplerWrapper
+from robosuite.wrappers.domain_randomization_wrapper import DomainRandomizationWrapper
+from robosuite.wrappers.visualization_wrapper import VisualizationWrapper
+
+try:
+ from robosuite.wrappers.gym_wrapper import GymWrapper
+except ImportError:
+    print("Warning: make sure gymnasium is installed if you want to use the GymWrapper.")
diff --git a/phantom/submodules/phantom-robosuite/robosuite/wrappers/data_collection_wrapper.py b/phantom/submodules/phantom-robosuite/robosuite/wrappers/data_collection_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..60602aa6d005b677d2416e0a64638a0b4109f7cd
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/wrappers/data_collection_wrapper.py
@@ -0,0 +1,188 @@
+"""
+This file implements a wrapper for saving simulation states to disk.
+This data collection wrapper is useful for collecting demonstrations.
+"""
+
+import os
+import time
+
+import numpy as np
+
+from robosuite.utils.mjcf_utils import save_sim_model
+from robosuite.wrappers import Wrapper
+
+
+class DataCollectionWrapper(Wrapper):
+ def __init__(self, env, directory, collect_freq=1, flush_freq=100):
+ """
+ Initializes the data collection wrapper.
+
+ Args:
+ env (MujocoEnv): The environment to monitor.
+ directory (str): Where to store collected data.
+ collect_freq (int): How often to save simulation state, in terms of environment steps.
+ flush_freq (int): How frequently to dump data to disk, in terms of environment steps.
+ """
+ super().__init__(env)
+
+ # the base directory for all logging
+ self.directory = directory
+
+ # in-memory cache for simulation states and action info
+ self.states = []
+ self.action_infos = [] # stores information about actions taken
+ self.successful = False # stores success state of demonstration
+
+ # how often to save simulation state, in terms of environment steps
+ self.collect_freq = collect_freq
+
+ # how frequently to dump data to disk, in terms of environment steps
+ self.flush_freq = flush_freq
+
+ if not os.path.exists(directory):
+ print("DataCollectionWrapper: making new directory at {}".format(directory))
+ os.makedirs(directory)
+
+ # store logging directory for current episode
+ self.ep_directory = None
+
+ # remember whether any environment interaction has occurred
+ self.has_interaction = False
+
+ # some variables for remembering the current episode's initial state and model xml
+ self._current_task_instance_state = None
+ self._current_task_instance_xml = None
+
+ def _start_new_episode(self):
+ """
+ Bookkeeping to do at the start of each new episode.
+ """
+
+ # flush any data left over from the previous episode if any interactions have happened
+ if self.has_interaction:
+ self._flush()
+
+ # timesteps in current episode
+ self.t = 0
+ self.has_interaction = False
+
+ # save the task instance (will be saved on the first env interaction)
+ self._current_task_instance_xml = self.env.sim.model.get_xml()
+ self._current_task_instance_state = np.array(self.env.sim.get_state().flatten())
+
+ # trick for ensuring that we can play MuJoCo demonstrations back
+ # deterministically by using the recorded actions open loop
+ self.env.reset_from_xml_string(self._current_task_instance_xml)
+ self.env.sim.reset()
+ self.env.sim.set_state_from_flattened(self._current_task_instance_state)
+ self.env.sim.forward()
+
+ def _on_first_interaction(self):
+ """
+ Bookkeeping for first timestep of episode.
+ This function is necessary to make sure that logging only happens after the first
+ step call to the simulation, instead of on the reset (people tend to call
+ reset more than is necessary in code).
+
+ Raises:
+ AssertionError: [Episode path already exists]
+ """
+
+ self.has_interaction = True
+
+ # create a directory with a timestamp
+ t1, t2 = str(time.time()).split(".")
+ self.ep_directory = os.path.join(self.directory, "ep_{}_{}".format(t1, t2))
+ assert not os.path.exists(self.ep_directory)
+ print("DataCollectionWrapper: making folder at {}".format(self.ep_directory))
+ os.makedirs(self.ep_directory)
+
+ # save the model xml
+ xml_path = os.path.join(self.ep_directory, "model.xml")
+ with open(xml_path, "w") as f:
+ f.write(self._current_task_instance_xml)
+
+ # save initial state and action
+ assert len(self.states) == 0
+ self.states.append(self._current_task_instance_state)
+
+ def _flush(self):
+ """
+ Method to flush internal state to disk.
+ """
+ t1, t2 = str(time.time()).split(".")
+ state_path = os.path.join(self.ep_directory, "state_{}_{}.npz".format(t1, t2))
+ if hasattr(self.env, "unwrapped"):
+ env_name = self.env.unwrapped.__class__.__name__
+ else:
+ env_name = self.env.__class__.__name__
+ np.savez(
+ state_path,
+ states=np.array(self.states),
+ action_infos=self.action_infos,
+ successful=self.successful,
+ env=env_name,
+ )
+ self.states = []
+ self.action_infos = []
+ self.successful = False
+
+ def reset(self):
+ """
+ Extends vanilla reset() function call to accommodate data collection
+
+ Returns:
+ OrderedDict: Environment observation space after reset occurs
+ """
+ ret = super().reset()
+ self._start_new_episode()
+ return ret
+
+ def step(self, action):
+ """
+ Extends vanilla step() function call to accommodate data collection
+
+ Args:
+ action (np.array): Action to take in environment
+
+ Returns:
+ 4-tuple:
+
+ - (OrderedDict) observations from the environment
+ - (float) reward from the environment
+ - (bool) whether the current episode is completed or not
+ - (dict) misc information
+ """
+ ret = super().step(action)
+ self.t += 1
+
+ # on the first time step, make directories for logging
+ if not self.has_interaction:
+ self._on_first_interaction()
+
+ # collect the current simulation state if necessary
+ if self.t % self.collect_freq == 0:
+ state = self.env.sim.get_state().flatten()
+ self.states.append(state)
+
+ info = {}
+ info["actions"] = np.array(action)
+ self.action_infos.append(info)
+
+ # check if the demonstration is successful
+ if self.env._check_success():
+ self.successful = True
+
+ # flush collected data to disk if necessary
+ if self.t % self.flush_freq == 0:
+ self._flush()
+
+ return ret
+
+ def close(self):
+ """
+ Override close method in order to flush left over data
+ """
+ if self.has_interaction:
+ self._flush()
+ self.env.close()
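A typical way to use this wrapper is to wrap a standard robosuite environment and roll out a policy; the environment name, output directory and random placeholder policy below are assumptions, not part of the patch:

import numpy as np
import robosuite as suite
from robosuite.wrappers import DataCollectionWrapper

env = suite.make("Lift", robots="Panda", has_renderer=False, use_camera_obs=False)
env = DataCollectionWrapper(env, directory="/tmp/robosuite_demos", collect_freq=1, flush_freq=100)

obs = env.reset()
low, high = env.action_spec
for _ in range(200):
    action = np.random.uniform(low, high)      # placeholder random policy
    obs, reward, done, info = env.step(action)
env.close()                                    # flushes any remaining states to disk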
diff --git a/phantom/submodules/phantom-robosuite/robosuite/wrappers/demo_sampler_wrapper.py b/phantom/submodules/phantom-robosuite/robosuite/wrappers/demo_sampler_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..227045fa52c0d3b01ca563886ca56f6eae1593f5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/wrappers/demo_sampler_wrapper.py
@@ -0,0 +1,316 @@
+"""
+This file contains a wrapper for sampling environment states
+from a set of demonstrations on every reset. The main use case is for
+altering the start state distribution of training episodes for
+learning RL policies.
+"""
+
+import os
+import random
+import time
+
+import h5py
+import numpy as np
+
+from robosuite.wrappers import Wrapper
+
+
+class DemoSamplerWrapper(Wrapper):
+ """
+ Initializes a wrapper that provides support for resetting the environment
+ state to one from a demonstration. It also supports curriculums for
+ altering how often to sample from demonstration vs. sampling a reset
+ state from the environment.
+
+ Args:
+ env (MujocoEnv): The environment to wrap.
+
+ demo_path (str): The path to the folder containing the demonstrations.
+ There should be a `demo.hdf5` file and a folder named `models` with
+ all of the stored model xml files from the demonstrations.
+
+ need_xml (bool): If True, the mujoco model needs to be reloaded when
+ sampling a state from a demonstration. This could be because every
+ demonstration was taken under varied object properties, for example.
+ In this case, every sampled state comes with a corresponding xml to
+ be used for the environment reset.
+
+ num_traj (int): If provided, subsample @number demonstrations from the
+ provided set of demonstrations instead of using all of them.
+
+ sampling_schemes (list of str): A list of sampling schemes
+ to be used. The following strings are valid schemes:
+
+ `'random'`: sample a reset state directly from the wrapped environment
+
+ `'uniform'`: sample a state from a demonstration uniformly at random
+
+ `'forward'`: sample a state from a window that grows progressively from
+ the start of demonstrations
+
+ `'reverse'`: sample a state from a window that grows progressively from
+ the end of demonstrations
+
+ scheme_ratios (list of float --> np.array): A list of probability values to
+ assign to each member of @sampling_schemes. Must be non-negative and
+ sum to 1.
+
+ open_loop_increment_freq (int): How frequently to increase
+ the window size in open loop schemes ("forward" and "reverse"). The
+ window size will increase by @open_loop_window_increment every
+ @open_loop_increment_freq samples. Only samples that are generated
+ by open loop schemes contribute to this count.
+
+ open_loop_initial_window_width (int): The width of the initial sampling
+ window, in terms of number of demonstration time steps, for
+ open loop schemes.
+
+ open_loop_window_increment (int): The window size will increase by
+ @open_loop_window_increment every @open_loop_increment_freq samples.
+ This number is in terms of number of demonstration time steps.
+
+ Raises:
+ AssertionError: [Incompatible envs]
+ AssertionError: [Invalid sampling scheme]
+ AssertionError: [Invalid scheme ratio]
+ """
+
+ def __init__(
+ self,
+ env,
+ demo_path,
+ need_xml=False,
+ num_traj=-1,
+ sampling_schemes=("uniform", "random"),
+ scheme_ratios=(0.9, 0.1),
+ open_loop_increment_freq=100,
+ open_loop_initial_window_width=25,
+ open_loop_window_increment=25,
+ ):
+ super().__init__(env)
+
+ self.demo_path = demo_path
+ hdf5_path = os.path.join(self.demo_path, "demo.hdf5")
+ self.demo_file = h5py.File(hdf5_path, "r")
+
+ # ensure that wrapped env matches the env on which demonstrations were collected
+ env_name = self.demo_file["data"].attrs["env"]
+ assert (
+ env_name == self.unwrapped.__class__.__name__
+ ), "Wrapped env {} does not match env on which demos were collected ({})".format(
+ env.__class__.__name__, env_name
+ )
+
+ # list of all demonstrations episodes
+ self.demo_list = list(self.demo_file["data"].keys())
+
+ # subsample a selection of demonstrations if requested
+ if num_traj > 0:
+ random.seed(3141) # ensure that the same set is sampled every time
+ self.demo_list = random.sample(self.demo_list, num_traj)
+
+ self.need_xml = need_xml
+ self.demo_sampled = 0
+
+ self.sample_method_dict = {
+ "random": "_random_sample",
+ "uniform": "_uniform_sample",
+ "forward": "_forward_sample_open_loop",
+ "reverse": "_reverse_sample_open_loop",
+ }
+
+ self.sampling_schemes = sampling_schemes
+ self.scheme_ratios = np.asarray(scheme_ratios)
+
+ # make sure the list of schemes is valid
+ schemes = self.sample_method_dict.keys()
+ assert np.all([(s in schemes) for s in self.sampling_schemes])
+
+ # make sure the distribution is the correct size
+ assert len(self.sampling_schemes) == len(self.scheme_ratios)
+
+ # make sure the distribution lies in the probability simplex
+ assert np.all(self.scheme_ratios > 0.0)
+ assert sum(self.scheme_ratios) == 1.0
+
+ # open loop configuration
+ self.open_loop_increment_freq = open_loop_increment_freq
+ self.open_loop_window_increment = open_loop_window_increment
+
+ # keep track of window size
+ self.open_loop_window_size = open_loop_initial_window_width
+
+ def reset(self):
+ """
+ Logic for sampling a state from the demonstration and resetting
+ the simulation to that state.
+
+ Returns:
+ OrderedDict: Environment observation space after reset occurs
+ """
+ state = self.sample()
+ if state is None:
+ # None indicates that a normal env reset should occur
+ return self.env.reset()
+ else:
+ if self.need_xml:
+ # reset the simulation from the model if necessary
+ state, xml = state
+ self.env.reset_from_xml_string(xml)
+
+ if isinstance(state, tuple):
+ state = state[0]
+
+ # force simulator state to one from the demo
+ self.sim.set_state_from_flattened(state)
+ self.sim.forward()
+
+ return self.env._get_observation()
+
+ def sample(self):
+ """
+ This is the core sampling method. Samples a state from a
+ demonstration, in accordance with the configuration.
+
+ Returns:
+ None or np.array or 2-tuple: If np.array, is the state sampled from a demo file. If 2-tuple, additionally
+ includes the model xml file
+ """
+
+ # chooses a sampling scheme randomly based on the mixing ratios
+ seed = random.uniform(0, 1)
+ ratio = np.cumsum(self.scheme_ratios)
+ ratio = ratio > seed
+ for i, v in enumerate(ratio):
+ if v:
+ break
+
+ sample_method = getattr(self, self.sample_method_dict[self.sampling_schemes[i]])
+ return sample_method()
+
+ def _random_sample(self):
+ """
+ Sampling method.
+
+ Return None to indicate that the state should be sampled directly
+ from the environment.
+ """
+ return None
+
+ def _uniform_sample(self):
+ """
+ Sampling method.
+
+ First uniformly sample a demonstration from the set of demonstrations.
+ Then uniformly sample a state from the selected demonstration.
+
+ Returns:
+ np.array or 2-tuple: If np.array, is the state sampled from a demo file. If 2-tuple, additionally
+ includes the model xml file
+ """
+
+ # get a random episode index
+ ep_ind = random.choice(self.demo_list)
+
+ # select a flattened mujoco state uniformly from this episode
+ states = self.demo_file["data/{}/states".format(ep_ind)][()]
+ state = random.choice(states)
+
+ if self.need_xml:
+ model_xml = self._xml_for_episode_index(ep_ind)
+ xml = self.env.edit_model_xml(model_xml)
+ return state, xml
+ return state
+
+ def _reverse_sample_open_loop(self):
+ """
+ Sampling method.
+
+ Open loop reverse sampling from demonstrations. Starts by
+ sampling from states near the end of the demonstrations.
+ Increases the window backwards as the number of calls to
+ this sampling method increases at a fixed rate.
+
+ Returns:
+ np.array or 2-tuple: If np.array, is the state sampled from a demo file. If 2-tuple, additionally
+ includes the model xml file
+ """
+
+ # get a random episode index
+ ep_ind = random.choice(self.demo_list)
+
+ # sample uniformly in a window that grows backwards from the end of the demos
+ states = self.demo_file["data/{}/states".format(ep_ind)][()]
+ eps_len = states.shape[0]
+ index = np.random.randint(max(eps_len - self.open_loop_window_size, 0), eps_len)
+ state = states[index]
+
+ # increase window size at a fixed frequency (open loop)
+ self.demo_sampled += 1
+ if self.demo_sampled >= self.open_loop_increment_freq:
+ if self.open_loop_window_size < eps_len:
+ self.open_loop_window_size += self.open_loop_window_increment
+ self.demo_sampled = 0
+
+ if self.need_xml:
+ model_xml = self._xml_for_episode_index(ep_ind)
+ xml = self.env.edit_model_xml(model_xml)
+ return state, xml
+
+ return state
+
+ def _forward_sample_open_loop(self):
+ """
+ Sampling method.
+
+ Open loop forward sampling from demonstrations. Starts by
+ sampling from states near the beginning of the demonstrations.
+ Increases the window forwards as the number of calls to
+ this sampling method increases at a fixed rate.
+
+ Returns:
+ np.array or 2-tuple: If np.array, is the state sampled from a demo file. If 2-tuple, additionally
+ includes the model xml file
+ """
+
+ # get a random episode index
+ ep_ind = random.choice(self.demo_list)
+
+ # sample uniformly in a window that grows forwards from the beginning of the demos
+ states = self.demo_file["data/{}/states".format(ep_ind)][()]
+ eps_len = states.shape[0]
+ index = np.random.randint(0, min(self.open_loop_window_size, eps_len))
+ state = states[index]
+
+ # increase window size at a fixed frequency (open loop)
+ self.demo_sampled += 1
+ if self.demo_sampled >= self.open_loop_increment_freq:
+ if self.open_loop_window_size < eps_len:
+ self.open_loop_window_size += self.open_loop_window_increment
+ self.demo_sampled = 0
+
+ if self.need_xml:
+ model_xml = self._xml_for_episode_index(ep_ind)
+ xml = self.env.edit_model_xml(model_xml)
+ return state, xml
+
+ return state
+
+ def _xml_for_episode_index(self, ep_ind):
+ """
+ Helper method to retrieve the corresponding model xml string
+ for the passed episode index.
+
+ Args:
+ ep_ind (int): Episode index to pull from demo file
+
+ Returns:
+ str: model xml as a string
+ """
+
+ # read the model xml, using the metadata stored in the attribute for this episode
+ model_file = self.demo_file["data/{}".format(ep_ind)].attrs["model_file"]
+ model_path = os.path.join(self.demo_path, "models", model_file)
+ with open(model_path, "r") as model_f:
+ model_xml = model_f.read()
+ return model_xml
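A usage sketch, assuming demonstrations for the same environment were collected earlier into a folder containing demo.hdf5 and a models/ subdirectory (the path and mixing ratios below are placeholders):

import robosuite as suite
from robosuite.wrappers import DemoSamplerWrapper

env = suite.make("Lift", robots="Panda", has_renderer=False, use_camera_obs=False)
env = DemoSamplerWrapper(
    env,
    demo_path="/tmp/lift_demos",               # placeholder demonstration folder
    need_xml=True,
    sampling_schemes=["uniform", "random"],
    scheme_ratios=[0.9, 0.1],
)

# each reset now either replays a demonstration state or performs a normal reset
obs = env.reset()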
diff --git a/phantom/submodules/phantom-robosuite/robosuite/wrappers/domain_randomization_wrapper.py b/phantom/submodules/phantom-robosuite/robosuite/wrappers/domain_randomization_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..70dcd7cb9ac9e77f38b2036d581015d84afb8a32
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/wrappers/domain_randomization_wrapper.py
@@ -0,0 +1,266 @@
+"""
+This file implements a wrapper for facilitating domain randomization over
+robosuite environments.
+"""
+import numpy as np
+
+from robosuite.utils.mjmod import CameraModder, DynamicsModder, LightingModder, TextureModder
+from robosuite.wrappers import Wrapper
+
+DEFAULT_COLOR_ARGS = {
+ "geom_names": None, # all geoms are randomized
+ "randomize_local": True, # sample nearby colors
+ "randomize_material": True, # randomize material reflectance / shininess / specular
+ "local_rgb_interpolation": 0.2,
+ "local_material_interpolation": 0.3,
+ "texture_variations": ["rgb", "checker", "noise", "gradient"], # all texture variation types
+ "randomize_skybox": True, # by default, randomize skybox too
+}
+
+DEFAULT_CAMERA_ARGS = {
+ "camera_names": None, # all cameras are randomized
+ "randomize_position": True,
+ "randomize_rotation": True,
+ "randomize_fovy": True,
+ "position_perturbation_size": 0.01,
+ "rotation_perturbation_size": 0.087,
+ "fovy_perturbation_size": 5.0,
+}
+
+DEFAULT_LIGHTING_ARGS = {
+ "light_names": None, # all lights are randomized
+ "randomize_position": True,
+ "randomize_direction": True,
+ "randomize_specular": True,
+ "randomize_ambient": True,
+ "randomize_diffuse": True,
+ "randomize_active": True,
+ "position_perturbation_size": 0.1,
+ "direction_perturbation_size": 0.35,
+ "specular_perturbation_size": 0.1,
+ "ambient_perturbation_size": 0.1,
+ "diffuse_perturbation_size": 0.1,
+}
+
+DEFAULT_DYNAMICS_ARGS = {
+ # Opt parameters
+ "randomize_density": True,
+ "randomize_viscosity": True,
+ "density_perturbation_ratio": 0.1,
+ "viscosity_perturbation_ratio": 0.1,
+ # Body parameters
+ "body_names": None, # all bodies randomized
+ "randomize_position": True,
+ "randomize_quaternion": True,
+ "randomize_inertia": True,
+ "randomize_mass": True,
+ "position_perturbation_size": 0.0015,
+ "quaternion_perturbation_size": 0.003,
+ "inertia_perturbation_ratio": 0.02,
+ "mass_perturbation_ratio": 0.02,
+ # Geom parameters
+ "geom_names": None, # all geoms randomized
+ "randomize_friction": True,
+ "randomize_solref": True,
+ "randomize_solimp": True,
+ "friction_perturbation_ratio": 0.1,
+ "solref_perturbation_ratio": 0.1,
+ "solimp_perturbation_ratio": 0.1,
+ # Joint parameters
+ "joint_names": None, # all joints randomized
+ "randomize_stiffness": True,
+ "randomize_frictionloss": True,
+ "randomize_damping": True,
+ "randomize_armature": True,
+ "stiffness_perturbation_ratio": 0.1,
+ "frictionloss_perturbation_size": 0.05,
+ "damping_perturbation_size": 0.01,
+ "armature_perturbation_size": 0.01,
+}
+
+
+class DomainRandomizationWrapper(Wrapper):
+ """
+ Wrapper that allows for domain randomization mid-simulation.
+
+ Args:
+ env (MujocoEnv): The environment to wrap.
+
+ seed (int): Integer used to seed all randomizations from this wrapper. It is
+ used to create a np.random.RandomState instance to make sure samples here
+ are isolated from sampling occurring elsewhere in the code. If not provided,
+ will default to using global random state.
+
+ randomize_color (bool): if True, randomize geom colors and texture colors
+
+ randomize_camera (bool): if True, randomize camera locations and parameters
+
+ randomize_lighting (bool): if True, randomize light locations and properties
+
+        randomize_dynamics (bool): if True, randomize dynamics parameters
+
+ color_randomization_args (dict): Color-specific randomization arguments
+
+ camera_randomization_args (dict): Camera-specific randomization arguments
+
+ lighting_randomization_args (dict): Lighting-specific randomization arguments
+
+        dynamics_randomization_args (dict): Dynamics-specific randomization arguments
+
+ randomize_on_reset (bool): if True, randomize on every call to @reset. This, in
+ conjunction with setting @randomize_every_n_steps to 0, is useful to
+ generate a new domain per episode.
+
+ randomize_every_n_steps (int): determines how often randomization should occur. Set
+ to 0 if randomization should happen manually (by calling @randomize_domain)
+
+ """
+
+ def __init__(
+ self,
+ env,
+ seed=None,
+ randomize_color=True,
+ randomize_camera=True,
+ randomize_lighting=True,
+ randomize_dynamics=True,
+ color_randomization_args=DEFAULT_COLOR_ARGS,
+ camera_randomization_args=DEFAULT_CAMERA_ARGS,
+ lighting_randomization_args=DEFAULT_LIGHTING_ARGS,
+ dynamics_randomization_args=DEFAULT_DYNAMICS_ARGS,
+ randomize_on_reset=True,
+ randomize_every_n_steps=1,
+ ):
+ super().__init__(env)
+
+ self.seed = seed
+ if seed is not None:
+ self.random_state = np.random.RandomState(seed)
+ else:
+ self.random_state = None
+ self.randomize_color = randomize_color
+ self.randomize_camera = randomize_camera
+ self.randomize_lighting = randomize_lighting
+ self.randomize_dynamics = randomize_dynamics
+ self.color_randomization_args = color_randomization_args
+ self.camera_randomization_args = camera_randomization_args
+ self.lighting_randomization_args = lighting_randomization_args
+ self.dynamics_randomization_args = dynamics_randomization_args
+ self.randomize_on_reset = randomize_on_reset
+ self.randomize_every_n_steps = randomize_every_n_steps
+
+ self.step_counter = 0
+
+ self.modders = []
+
+ if self.randomize_color:
+ self.tex_modder = TextureModder(
+ sim=self.env.sim, random_state=self.random_state, **self.color_randomization_args
+ )
+ self.modders.append(self.tex_modder)
+
+ if self.randomize_camera:
+ self.camera_modder = CameraModder(
+ sim=self.env.sim,
+ random_state=self.random_state,
+ **self.camera_randomization_args,
+ )
+ self.modders.append(self.camera_modder)
+
+ if self.randomize_lighting:
+ self.light_modder = LightingModder(
+ sim=self.env.sim,
+ random_state=self.random_state,
+ **self.lighting_randomization_args,
+ )
+ self.modders.append(self.light_modder)
+
+ if self.randomize_dynamics:
+ self.dynamics_modder = DynamicsModder(
+ sim=self.env.sim,
+ random_state=self.random_state,
+ **self.dynamics_randomization_args,
+ )
+ self.modders.append(self.dynamics_modder)
+
+ self.save_default_domain()
+
+ def reset(self):
+ """
+ Extends superclass method to reset the domain randomizer.
+
+ Returns:
+ OrderedDict: Environment observation space after reset occurs
+ """
+ # undo all randomizations
+ self.restore_default_domain()
+
+ # normal env reset
+ ret = super().reset()
+
+ # save the original env parameters
+ self.save_default_domain()
+
+ # reset counter for doing domain randomization at a particular frequency
+ self.step_counter = 0
+
+ # update sims
+ for modder in self.modders:
+ modder.update_sim(self.env.sim)
+
+ if self.randomize_on_reset:
+ # domain randomize + regenerate observation
+ self.randomize_domain()
+ ret = self.env._get_observations()
+
+ return ret
+
+ def step(self, action):
+ """
+ Extends vanilla step() function call to accommodate domain randomization
+
+ Returns:
+ 4-tuple:
+
+ - (OrderedDict) observations from the environment
+ - (float) reward from the environment
+ - (bool) whether the current episode is completed or not
+ - (dict) misc information
+ """
+ # Step the internal randomization state
+ self.step_randomization()
+
+ return super().step(action)
+
+ def step_randomization(self):
+ """
+ Steps the internal randomization state
+ """
+ # functionality for randomizing at a particular frequency
+ if self.randomize_every_n_steps > 0:
+ if self.step_counter % self.randomize_every_n_steps == 0:
+ self.randomize_domain()
+ self.step_counter += 1
+
+ def randomize_domain(self):
+ """
+ Runs domain randomization over the environment.
+ """
+ for modder in self.modders:
+ modder.randomize()
+
+ def save_default_domain(self):
+ """
+ Saves the current simulation model parameters so
+ that they can be restored later.
+ """
+ for modder in self.modders:
+ modder.save_defaults()
+
+ def restore_default_domain(self):
+ """
+ Restores the simulation model parameters saved
+ in the last call to @save_default_domain.
+ """
+ for modder in self.modders:
+ modder.restore_defaults()
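A minimal sketch of wrapping an environment for per-episode randomization (the environment name and seed are placeholders):

import robosuite as suite
from robosuite.wrappers import DomainRandomizationWrapper

env = suite.make("Lift", robots="Panda", has_renderer=False, use_camera_obs=False)
env = DomainRandomizationWrapper(
    env,
    seed=0,                      # reproducible randomization stream
    randomize_on_reset=True,     # draw a new domain at every reset...
    randomize_every_n_steps=0,   # ...and keep it fixed within an episode
)

obs = env.reset()                # observations now come from the randomized domain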
diff --git a/phantom/submodules/phantom-robosuite/robosuite/wrappers/gym_wrapper.py b/phantom/submodules/phantom-robosuite/robosuite/wrappers/gym_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..329cdaf2ad2b6b691a79e4e0386f8ce24bee7cc4
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/wrappers/gym_wrapper.py
@@ -0,0 +1,134 @@
+"""
+This file implements a wrapper for facilitating compatibility with OpenAI gym.
+This is useful when using these environments with code that assumes a gym-like
+interface.
+"""
+
+import numpy as np
+import gymnasium as gym
+from gymnasium import spaces, Env
+
+from robosuite.wrappers import Wrapper
+
+
+class GymWrapper(Wrapper, gym.Env):
+    """
+ Initializes the Gym wrapper. Mimics many of the required functionalities of the Wrapper class
+ found in the gym.core module
+
+ Args:
+ env (MujocoEnv): The environment to wrap.
+ keys (None or list of str): If provided, each observation will
+ consist of concatenated keys from the wrapped environment's
+ observation dictionary. Defaults to proprio-state and object-state.
+
+ Raises:
+ AssertionError: [Object observations must be enabled if no keys]
+ """
+
+ def __init__(self, env, keys=None):
+ # Run super method
+ super().__init__(env=env)
+ # Create name for gym
+ robots = "".join([type(robot.robot_model).__name__ for robot in self.env.robots])
+ self.name = robots + "_" + type(self.env).__name__
+
+ # Get reward range
+ self.reward_range = (0, self.env.reward_scale)
+
+ if keys is None:
+ keys = []
+ # Add object obs if requested
+ if self.env.use_object_obs:
+ keys += ["object-state"]
+ # Add image obs if requested
+ if self.env.use_camera_obs:
+ keys += [f"{cam_name}_image" for cam_name in self.env.camera_names]
+ # Iterate over all robots to add to state
+ for idx in range(len(self.env.robots)):
+ keys += ["robot{}_proprio-state".format(idx)]
+ self.keys = keys
+
+ # Gym specific attributes
+ self.env.spec = None
+
+ # set up observation and action spaces
+ obs = self.env.reset()
+ self.modality_dims = {key: obs[key].shape for key in self.keys}
+ flat_ob = self._flatten_obs(obs)
+ self.obs_dim = flat_ob.size
+ high = np.inf * np.ones(self.obs_dim)
+ low = -high
+ self.observation_space = spaces.Box(low, high)
+ low, high = self.env.action_spec
+ self.action_space = spaces.Box(low, high)
+
+ def _flatten_obs(self, obs_dict, verbose=False):
+ """
+ Filters keys of interest out and concatenate the information.
+
+ Args:
+ obs_dict (OrderedDict): ordered dictionary of observations
+ verbose (bool): Whether to print out to console as observation keys are processed
+
+ Returns:
+ np.array: observations flattened into a 1d array
+ """
+ ob_lst = []
+ for key in self.keys:
+ if key in obs_dict:
+ if verbose:
+ print("adding key: {}".format(key))
+ ob_lst.append(np.array(obs_dict[key]).flatten())
+ return np.concatenate(ob_lst)
+
+ def reset(self, seed=None, options=None):
+ """
+ Extends env reset method to return flattened observation instead of normal OrderedDict and optionally resets seed
+
+ Returns:
+ np.array: Flattened environment observation space after reset occurs
+ """
+ if seed is not None:
+ if isinstance(seed, int):
+ np.random.seed(seed)
+ else:
+ raise TypeError("Seed must be an integer type!")
+ ob_dict = self.env.reset()
+ return self._flatten_obs(ob_dict), {}
+
+ def step(self, action):
+ """
+ Extends vanilla step() function call to return flattened observation instead of normal OrderedDict.
+
+ Args:
+ action (np.array): Action to take in environment
+
+ Returns:
+            5-tuple:
+
+ - (np.array) flattened observations from the environment
+ - (float) reward from the environment
+ - (bool) episode ending after reaching an env terminal state
+ - (bool) episode ending after an externally defined condition
+ - (dict) misc information
+ """
+ ob_dict, reward, terminated, info = self.env.step(action)
+ return self._flatten_obs(ob_dict), reward, terminated, False, info
+
+ def compute_reward(self, achieved_goal, desired_goal, info):
+ """
+ Dummy function to be compatible with gym interface that simply returns environment reward
+
+ Args:
+ achieved_goal: [NOT USED]
+ desired_goal: [NOT USED]
+ info: [NOT USED]
+
+ Returns:
+ float: environment reward
+ """
+ # Dummy args used to mimic Wrapper interface
+ return self.env.reward()
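With the gymnasium-style API above, the wrapper can be dropped into code that expects reset(seed) and 5-tuple step returns; a sketch with placeholder environment and observation keys:

import robosuite as suite
from robosuite.wrappers import GymWrapper

env = GymWrapper(
    suite.make("Lift", robots="Panda", has_renderer=False, use_camera_obs=False),
    keys=["robot0_proprio-state", "object-state"],   # concatenated into one flat Box observation
)

obs, info = env.reset(seed=0)
action = env.action_space.sample()
obs, reward, terminated, truncated, info = env.step(action)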
diff --git a/phantom/submodules/phantom-robosuite/robosuite/wrappers/visualization_wrapper.py b/phantom/submodules/phantom-robosuite/robosuite/wrappers/visualization_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..64b847af6ee67acbe61353340bfbf06f2cb09c1e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/wrappers/visualization_wrapper.py
@@ -0,0 +1,186 @@
+"""
+This file implements a wrapper for visualizing important sites in a given environment.
+
+By default, this visualizes all sites possible for the environment. Visualization options
+for a given environment can be found by calling `get_visualization_settings()`, and can
+be set individually by calling `set_visualization_setting(setting, visible)`.
+"""
+import xml.etree.ElementTree as ET
+from copy import deepcopy
+
+import numpy as np
+
+from robosuite.utils.mjcf_utils import new_body, new_geom, new_site
+from robosuite.wrappers import Wrapper
+
+DEFAULT_INDICATOR_SITE_CONFIG = {
+ "type": "sphere",
+ "size": [0.03],
+ "rgba": [1, 0, 0, 0.5],
+}
+
+
+class VisualizationWrapper(Wrapper):
+ def __init__(self, env, indicator_configs=None):
+ """
+ Initializes the data collection wrapper. Note that this automatically conducts a (hard) reset initially to make
+ sure indicators are properly added to the sim model.
+
+ Args:
+ env (MujocoEnv): The environment to visualize
+
+ indicator_configs (None or str or dict or list): Configurations to use for indicator objects.
+
+ If None, no indicator objects will be used
+
+ If a string, this should be `'default'`, which corresponds to single default spherical indicator
+
+ If a dict, should specify a single indicator object config
+
+ If a list, should specify specific indicator object configs to use for multiple indicators (which in
+ turn can either be `'default'` or a dict)
+
+ As each indicator object is essentially a site element, each dict should map site attribute keywords to
+ values. Note that, at the very minimum, the `'name'` attribute MUST be specified for each indicator. See
+ http://www.mujoco.org/book/XMLreference.html#site for specific site attributes that can be specified.
+ """
+ super().__init__(env)
+
+ # Make sure that the environment is NOT using segmentation sensors, since we cannot use segmentation masks
+ # with visualization sites simultaneously
+ assert all(
+ seg is None for seg in env.camera_segmentations
+ ), "Cannot use camera segmentations with visualization wrapper!"
+
+ # Standardize indicator configs
+ self.indicator_configs = None
+ if indicator_configs is not None:
+ self.indicator_configs = []
+ if type(indicator_configs) in {str, dict}:
+ indicator_configs = [indicator_configs]
+ for i, indicator_config in enumerate(indicator_configs):
+ if indicator_config == "default":
+ indicator_config = deepcopy(DEFAULT_INDICATOR_SITE_CONFIG)
+ indicator_config["name"] = f"indicator{i}"
+ # Make sure name attribute is specified
+ assert "name" in indicator_config, "Name must be specified for all indicator object configurations!"
+ # Add this configuration to the internal array
+ self.indicator_configs.append(indicator_config)
+
+ # Create internal dict to store visualization settings (set to True by default)
+ self._vis_settings = {vis: True for vis in self.env._visualizations}
+
+ # Add the post-processor to make sure indicator objects get added to model before it's actually loaded in sim
+ self.env.set_xml_processor(processor=self._add_indicators_to_model)
+
+ # Conduct a (hard) reset to make sure visualization changes propagate
+ reset_mode = self.env.hard_reset
+ self.env.hard_reset = True
+ self.reset()
+ self.env.hard_reset = reset_mode
+
+ def get_indicator_names(self):
+ """
+ Gets all indicator object names for this environment.
+
+ Returns:
+ list: Indicator names for this environment.
+ """
+ return (
+ [ind_config["name"] for ind_config in self.indicator_configs] if self.indicator_configs is not None else []
+ )
+
+ def set_indicator_pos(self, indicator, pos):
+ """
+ Sets the specified @indicator to the desired position @pos
+
+ Args:
+ indicator (str): Name of the indicator to set
+ pos (3-array): (x, y, z) Cartesian world coordinates to set the specified indicator to
+ """
+ # Make sure indicator is valid
+ indicator_names = set(self.get_indicator_names())
+ assert indicator in indicator_names, "Invalid indicator name specified. Valid options are {}, got {}".format(
+ indicator_names, indicator
+ )
+ # Set the specified indicator
+ self.env.sim.model.body_pos[self.env.sim.model.body_name2id(indicator + "_body")] = np.array(pos)
+
+ def get_visualization_settings(self):
+ """
+ Gets all settings for visualizing this environment
+
+ Returns:
+ list: Visualization keywords for this environment.
+ """
+ return self._vis_settings.keys()
+
+ def set_visualization_setting(self, setting, visible):
+ """
+ Sets the specified @setting to have visibility = @visible.
+
+ Args:
+ setting (str): Visualization keyword to set
+ visible (bool): True if setting should be visualized.
+ """
+ assert (
+ setting in self._vis_settings
+ ), "Invalid visualization setting specified. Valid options are {}, got {}".format(
+ self._vis_settings.keys(), setting
+ )
+ self._vis_settings[setting] = visible
+
+ def reset(self):
+ """
+ Extends vanilla reset() function call to accommodate visualization
+
+ Returns:
+ OrderedDict: Environment observation space after reset occurs
+ """
+ ret = super().reset()
+ # Update any visualization
+ self.env.visualize(vis_settings=self._vis_settings)
+ return ret
+
+ def step(self, action):
+ """
+ Extends vanilla step() function call to accommodate visualization
+
+ Args:
+ action (np.array): Action to take in environment
+
+ Returns:
+ 4-tuple:
+
+ - (OrderedDict) observations from the environment
+ - (float) reward from the environment
+ - (bool) whether the current episode is completed or not
+ - (dict) misc information
+ """
+ ret = super().step(action)
+
+ # Update any visualization
+ self.env.visualize(vis_settings=self._vis_settings)
+
+ return ret
+
+ def _add_indicators_to_model(self, xml):
+ """
+ Adds indicators to the mujoco simulation model
+
+ Args:
+ xml (string): MJCF model in xml format, for the current simulation to be loaded
+ """
+ if self.indicator_configs is not None:
+ root = ET.fromstring(xml)
+ worldbody = root.find("worldbody")
+
+ for indicator_config in self.indicator_configs:
+ config = deepcopy(indicator_config)
+ indicator_body = new_body(name=config["name"] + "_body", pos=config.pop("pos", (0, 0, 0)))
+ indicator_body.append(new_site(**config))
+ worldbody.append(indicator_body)
+
+ xml = ET.tostring(root, encoding="utf8").decode("utf8")
+
+ return xml
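The indicator and visualization API above is easiest to follow with a short usage sketch. The snippet below is illustrative only: it assumes the wrapper is constructed as `VisualizationWrapper(env, indicator_configs=...)` (the constructor is not shown in full in this hunk), that it is exported from `robosuite.wrappers`, and that `"grippers"` is one of the environment's visualization keywords.

```python
# Minimal usage sketch (assumptions noted above): wrap an env, move an indicator
# site, and toggle one visualization flag before stepping.
import numpy as np
import robosuite as suite
from robosuite.wrappers import VisualizationWrapper

env = suite.make(
    "Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
)
env = VisualizationWrapper(env, indicator_configs=["default"])

print(env.get_indicator_names())                      # e.g. ["indicator0"]
env.set_indicator_pos("indicator0", [0.1, 0.0, 1.0])  # world-frame (x, y, z)
env.set_visualization_setting("grippers", False)      # assumed setting name

obs = env.reset()
low, high = env.action_spec
obs, reward, done, info = env.step(np.zeros(low.shape))
```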
diff --git a/phantom/submodules/phantom-robosuite/robosuite/wrappers/wrapper.py b/phantom/submodules/phantom-robosuite/robosuite/wrappers/wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..77c922dfdfc0fcd58109e4a36c7815dea4524cb8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/robosuite/wrappers/wrapper.py
@@ -0,0 +1,135 @@
+"""
+This file contains the base wrapper class for Mujoco environments.
+Wrappers are useful for data collection and logging. Highly recommended.
+"""
+
+
+class Wrapper:
+ """
+ Base class for all wrappers in robosuite.
+
+ Args:
+ env (MujocoEnv): The environment to wrap.
+ """
+
+ def __init__(self, env):
+ self.env = env
+
+ @classmethod
+ def class_name(cls):
+ return cls.__name__
+
+ def _warn_double_wrap(self):
+ """
+ Utility function that checks if we're accidentally trying to double wrap an env
+
+ Raises:
+ Exception: [Double wrapping env]
+ """
+ env = self.env
+ while True:
+ if isinstance(env, Wrapper):
+ if env.class_name() == self.class_name():
+ raise Exception("Attempted to double wrap with Wrapper: {}".format(self.__class__.__name__))
+ env = env.env
+ else:
+ break
+
+ def step(self, action):
+ """
+ By default, run the normal environment step() function
+
+ Args:
+ action (np.array): action to take in environment
+
+ Returns:
+ 4-tuple:
+
+ - (OrderedDict) observations from the environment
+ - (float) reward from the environment
+ - (bool) whether the current episode is completed or not
+ - (dict) misc information
+ """
+ return self.env.step(action)
+
+ def reset(self):
+ """
+ By default, run the normal environment reset() function
+
+ Returns:
+ OrderedDict: Environment observation space after reset occurs
+ """
+ return self.env.reset()
+
+ def render(self, **kwargs):
+ """
+ By default, run the normal environment render() function
+
+ Args:
+ **kwargs (dict): Any args to pass to environment render function
+ """
+ return self.env.render(**kwargs)
+
+ def observation_spec(self):
+ """
+ By default, grabs the normal environment observation_spec
+
+ Returns:
+ OrderedDict: Observations from the environment
+ """
+ return self.env.observation_spec()
+
+ @property
+ def action_spec(self):
+ """
+ By default, grabs the normal environment action_spec
+
+ Returns:
+ 2-tuple:
+
+ - (np.array) minimum (low) action values
+ - (np.array) maximum (high) action values
+ """
+ return self.env.action_spec
+
+ @property
+ def action_dim(self):
+ """
+ By default, grabs the normal environment action_dim
+
+ Returns:
+ int: Action space dimension
+ """
+ return self.env.dof
+
+ @property
+ def unwrapped(self):
+ """
+ Grabs unwrapped environment
+
+ Returns:
+ env (MujocoEnv): Unwrapped environment
+ """
+ if hasattr(self.env, "unwrapped"):
+ return self.env.unwrapped
+ else:
+ return self.env
+
+ # this method is a fallback option on any methods the original env might support
+ def __getattr__(self, attr):
+ # using getattr ensures that both __getattribute__ and __getattr__ (fallback) get called
+ # (see https://stackoverflow.com/questions/3278077/difference-between-getattr-vs-getattribute)
+ orig_attr = getattr(self.env, attr)
+ if callable(orig_attr):
+
+ def hooked(*args, **kwargs):
+ result = orig_attr(*args, **kwargs)
+ # prevent wrapped_class from becoming unwrapped
+ # NOTE: had to use "is" to prevent errors when returning numpy arrays from a wrapped method
+ if result is self.env:
+ return self
+ return result
+
+ return hooked
+ else:
+ return orig_attr
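Because the base `Wrapper` simply stores the environment and forwards everything else through `__getattr__`, a custom wrapper only needs to override the hooks it cares about. Below is a minimal sketch for logging actions during data collection; the class name is hypothetical, and the import path just points at the module added above.

```python
# Minimal sketch of a custom wrapper: record every action passed to step().
import numpy as np

from robosuite.wrappers.wrapper import Wrapper


class ActionLoggingWrapper(Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.logged_actions = []

    def reset(self):
        # Clear the log at the start of each episode.
        self.logged_actions = []
        return super().reset()

    def step(self, action):
        # Store a copy of the action, then defer to the wrapped env.
        self.logged_actions.append(np.array(action))
        return super().step(action)
```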
diff --git a/phantom/submodules/phantom-robosuite/setup.py b/phantom/submodules/phantom-robosuite/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..78b74e6955f406b3023459136090d923bd9949ab
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/setup.py
@@ -0,0 +1,37 @@
+# read the contents of your README file
+from os import path
+
+from setuptools import find_packages, setup
+
+this_directory = path.abspath(path.dirname(__file__))
+with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
+ lines = f.readlines()
+
+# remove images from README
+lines = [x for x in lines if ".png" not in x]
+long_description = "".join(lines)
+
+setup(
+ name="robosuite",
+ packages=[package for package in find_packages() if package.startswith("robosuite")],
+ install_requires=[
+ "numpy>=1.13.3",
+ "numba>=0.49.1",
+ "scipy>=1.2.3",
+ "mujoco>=2.3.0",
+ "Pillow",
+ "opencv-python",
+ "pynput",
+ "termcolor",
+ ],
+ eager_resources=["*"],
+ include_package_data=True,
+ python_requires=">=3",
+ description="robosuite: A Modular Simulation Framework and Benchmark for Robot Learning",
+ author="Yuke Zhu",
+ url="https://github.com/ARISE-Initiative/robosuite",
+ author_email="yukez@cs.utexas.edu",
+ version="1.4.1",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+)
diff --git a/phantom/submodules/phantom-robosuite/tests/test_controllers/test_all_controllers.py b/phantom/submodules/phantom-robosuite/tests/test_controllers/test_all_controllers.py
new file mode 100644
index 0000000000000000000000000000000000000000..356057b0d7aa4f748067fbfe9c8ac801c6d8003e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_controllers/test_all_controllers.py
@@ -0,0 +1,155 @@
+"""
+Test all controllers on the Lift task with Sawyer robot environment as a test case.
+
+The following controllers are tested:
+Operational Space Control - Position & Orientation
+Operational Space Control - Position only
+Inverse Kinematics - Position & Orientation
+Joint Impedance
+Joint Velocity
+Joint Torque
+
+This (non-exhaustive) test script checks for qualitative irregularities in controller behavior.
+In addition, this testing module checks for action space correctness and dimensionality.
+For every controller action space, the test runs through each dimension and executes a perturbation "test_value" from its
+neutral (stationary) value for a certain amount of time "steps_per_action", and then returns to all neutral values
+for time "steps_per_rest" before proceeding with the next action dim.
+
+ E.g.: Given that the expected action space of the Pos / Ori (OSC_POSE) controller (without a gripper) is
+ (dx, dy, dz, ax, ay, az), the testing sequence of actions over time will be:
+
+ ***START OF TEST***
+ ( dx, 0, 0, 0, 0, 0, grip) <-- Translation in x-direction for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, dy, 0, 0, 0, 0, grip) <-- Translation in y-direction for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, 0, dz, 0, 0, 0, grip) <-- Translation in z-direction for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, 0, 0, a, 0, 0, grip) <-- Rotation about x axis for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, 0, 0, 0, a, 0, grip) <-- Rotation about y axis for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ( 0, 0, 0, 0, 0, a, grip) <-- Rotation about z axis for 'steps_per_action' steps
+ ( 0, 0, 0, 0, 0, 0, grip) <-- No movement (pause) for 'steps_per_rest' steps
+ ***END OF TEST***
+
+ Thus the OSC_POSE controller should be expected to sequentially move linearly in the x direction first,
+ then the y direction, then the z direction, and then begin sequentially rotating about its x-axis,
+ then y-axis, then z-axis.
+
+Please reference the controller README in the robosuite/controllers directory for an overview of each controller.
+Controllers are expected to behave in a generally controlled manner, according to their control space.
+ E.g.: the Pos / Ori controller should be expected to move linearly in the x direction first, then the y direction,
+ then the z direction, and then begin rotating about its x-axis, then y-axis, then z-axis.
+
+As this is strictly a qualitative set of tests, it is up to the developer / user to examine for specific irregularities.
+However, the expected qualitative behavior is described below for each controller:
+
+* OSC_POSE: Gripper moves sequentially and linearly in x, y, z direction, then sequentially rotates in x-axis,
+ y-axis, z-axis, relative to the global coordinate frame
+* OSC_POSITION: Gripper moves sequentially and linearly in x, y, z direction, relative to the global coordinate frame
+* IK_POSE: Gripper moves sequentially and linearly in x, y, z direction, then sequentially rotates in x-axis, y-axis,
+ z-axis, relative to the local robot end effector frame
+* JOINT_POSITION: Robot Joints move sequentially in a controlled fashion
+* JOINT_VELOCITY: Robot Joints move sequentially in a controlled fashion
+* JOINT_TORQUE: Unlike other controllers, the joint torque controller is expected to act rather lethargically, as the
+ "controller" is really just a wrapper for direct torque control of the mujoco actuators. Therefore, a
+ "neutral" value of 0 torque will not guarantee a stable robot when it has non-zero velocity!
+
+Note that by default, there is no rendering. Rendering can be enabled by setting the --render flag when calling this
+test script.
+
+"""
+import argparse
+
+import numpy as np
+
+import robosuite as suite
+import robosuite.utils.transform_utils as T
+from robosuite import load_controller_config
+
+# Arguments for this test script
+parser = argparse.ArgumentParser()
+parser.add_argument("--render", action="store_true", help="Whether to render this test or not for visual validation")
+args = parser.parse_args()
+
+# Define the controllers to use (action_dim, num_test_steps, test_value)
+controllers = {
+ "OSC_POSE": [7, 6, 0.1],
+ "OSC_POSITION": [4, 3, 0.1],
+ "IK_POSE": [7, 6, 0.01],
+ "JOINT_POSITION": [8, 7, 0.2],
+ "JOINT_VELOCITY": [8, 7, -0.1],
+ "JOINT_TORQUE": [8, 7, 0.25],
+}
+
+# Define the number of timesteps to use per controller action as well as timesteps in between actions
+steps_per_action = 50
+steps_per_rest = 25
+
+
+def test_all_controllers():
+ for controller_name in controllers.keys():
+ # Define variables for each controller test
+ action_dim = controllers[controller_name][0]
+ num_test_steps = controllers[controller_name][1]
+ test_value = controllers[controller_name][2]
+ neutral = np.zeros(action_dim)
+
+ # Define controller path to load
+ controller_config = load_controller_config(default_controller=controller_name)
+
+ # Now, create a test env for testing the controller on
+ env = suite.make(
+ "Lift",
+ robots="Sawyer",
+ has_renderer=args.render, # use on-screen renderer for visual validation only if requested
+ has_offscreen_renderer=False,
+ use_camera_obs=False,
+ horizon=(steps_per_action + steps_per_rest) * num_test_steps,
+ controller_configs=controller_config,
+ )
+ print("Testing controller: {}...".format(controller_name))
+
+ env.reset()
+ # If rendering, set controller to front view to get best angle for viewing robot movements
+ if args.render:
+ env.viewer.set_camera(camera_id=0)
+
+ # get action range
+ action_min, action_max = env.action_spec
+ assert action_min.shape == action_max.shape
+ assert action_min.shape[0] == action_dim, "Expected {}, got {}".format(action_dim, action_min.shape[0])
+
+ # Keep track of done variable to know when to break loop
+ count = 0
+ # Loop through controller space
+ while count < num_test_steps:
+ action = neutral.copy()
+ for i in range(steps_per_action):
+ if controller_name in {"IK_POSE", "OSC_POSE"} and count > 2:
+ # Set this value to be the angle and set appropriate axis
+ vec = np.zeros(3)
+ vec[count - 3] = test_value
+ action[3:6] = vec
+ else:
+ action[count] = test_value
+ env.step(action)
+ if args.render:
+ env.render()
+ for i in range(steps_per_rest):
+ env.step(neutral)
+ if args.render:
+ env.render()
+ count += 1
+
+ # Shut down this env before starting the next test
+ env.close()
+
+ # Tests passed!
+ print("All controller tests completed.")
+
+
+if __name__ == "__main__":
+
+ test_all_controllers()
diff --git a/phantom/submodules/phantom-robosuite/tests/test_controllers/test_linear_interpolator.py b/phantom/submodules/phantom-robosuite/tests/test_controllers/test_linear_interpolator.py
new file mode 100644
index 0000000000000000000000000000000000000000..d12729562c2bb9538dd3cc9b53b39b250a5622a8
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_controllers/test_linear_interpolator.py
@@ -0,0 +1,195 @@
+"""
+Test the linear interpolator on the Lift task with Sawyer arm environment as a test case.
+
+The linear interpolator is meant to increase the stability and overall safety of a robot arm's trajectory when reaching
+a setpoint, "ramping up" the actual action command sent to a given controller from zero to the actual inputted action
+over a fraction of the timesteps in between each high-level input action (the "ramp ratio"). As a result, the
+trajectory should be smoother, in proportion to the interpolator's ramp ratio setting.
+
+This test verifies that the linear interpolator works correctly on both the IK and OSC controller for both position and
+orientation, and proceeds as follows:
+
+ 1. Given a constant delta position action, and with the interpolator disabled, we will measure the sum of absolute
+ changes in joint torques between individual simulation timesteps
+
+ 2. We will repeat Step 1, but this time with the interpolator enabled and with a ramp ratio of 1.0 (max value)
+
+ 3. We expect the interpolated trajectories to experience a smaller overall magnitude of changes in torques, due to
+ the setpoints between controller timesteps being smoothed out over the ramp ratio.
+
+Note: As this is a qualitative test, it is up to the user to evaluate the output and determine the expected behavior of
+the tested controllers.
+"""
+
+import argparse
+import json
+import os
+
+import numpy as np
+
+import robosuite as suite
+import robosuite.utils.transform_utils as T
+
+# Define the threshold locations, delta values, and ratio #
+
+# Translation trajectory
+pos_y_threshold = 0.1
+delta_pos_y = 0.01
+pos_action_osc = [0, delta_pos_y * 40, 0]
+pos_action_ik = [0, delta_pos_y, 0]
+
+# Rotation trajectory
+rot_r_threshold = np.pi / 2
+delta_rot_r = 0.01
+rot_action_osc = [delta_rot_r * 40, 0, 0]
+rot_action_ik = [delta_rot_r * 5, 0, 0]
+
+# Concatenated thresholds and corresponding indexes (y = 1 in x,y,z; roll = 0 in r,p,y)
+thresholds = [pos_y_threshold, rot_r_threshold]
+indexes = [1, 0]
+
+# Threshold ratio
+min_ratio = 1.10
+
+# Define arguments for this test
+parser = argparse.ArgumentParser()
+parser.add_argument("--render", action="store_true", help="Whether to render tests or run headless")
+args = parser.parse_args()
+
+# Setup printing options for numbers
+np.set_printoptions(formatter={"float": lambda x: "{0:0.3f}".format(x)})
+
+
+# function to run the actual sim in order to receive summed absolute delta torques
+def step(env, action, current_torques):
+ env.timestep += 1
+ policy_step = True
+ summed_abs_delta_torques = np.zeros(7)
+
+ for i in range(int(env.control_timestep / env.model_timestep)):
+ env.sim.forward()
+ env._pre_action(action, policy_step)
+ last_torques = current_torques
+ current_torques = env.robots[0].torques
+ summed_abs_delta_torques += np.abs(current_torques - last_torques)
+ env.sim.step()
+ policy_step = False
+
+ env.cur_time += env.control_timestep
+ out = env._post_action(action)
+ return out, summed_abs_delta_torques, current_torques
+
+
+# Running the actual test #
+def test_linear_interpolator():
+
+ for controller_name in ["IK_POSE", "OSC_POSE"]:
+
+ for traj in ["pos", "ori"]:
+
+ # Define counter to increment timesteps and torques for each trajectory
+ timesteps = [0, 0]
+ summed_abs_delta_torques = [np.zeros(7), np.zeros(7)]
+
+ for interpolator in [None, "linear"]:
+ # Define numpy seed so we guarantee consistent starting pos / ori for each trajectory
+ np.random.seed(3)
+
+ # Define controller path to load
+ controller_path = os.path.join(
+ os.path.dirname(__file__),
+ "../../robosuite",
+ "controllers/config/{}.json".format(controller_name.lower()),
+ )
+ with open(controller_path) as f:
+ controller_config = json.load(f)
+ controller_config["interpolation"] = interpolator
+ controller_config["ramp_ratio"] = 1.0
+
+ # Now, create a test env for testing the controller on
+ env = suite.make(
+ "Lift",
+ robots="Sawyer",
+ has_renderer=args.render, # by default, don't use on-screen renderer for visual validation
+ has_offscreen_renderer=False,
+ use_camera_obs=False,
+ horizon=10000,
+ control_freq=20,
+ controller_configs=controller_config,
+ )
+
+ # Reset the environment
+ env.reset()
+
+ # Hardcode the starting position for sawyer
+ init_qpos = [-0.5538, -0.8208, 0.4155, 1.8409, -0.4955, 0.6482, 1.9628]
+ env.robots[0].set_robot_joint_positions(init_qpos)
+ env.robots[0].controller.update_initial_joints(init_qpos)
+ env.robots[0].controller.reset_goal()
+
+ # Notify user a new trajectory is beginning
+ print(
+ "\nTesting controller {} with trajectory {} and interpolator={}...".format(
+ controller_name, traj, interpolator
+ )
+ )
+
+ # If rendering, set controller to front view to get best angle for viewing robot movements
+ if args.render:
+ env.viewer.set_camera(camera_id=0)
+
+ # Keep track of state of robot eef (pos, ori (euler)) and torques
+ current_torques = np.zeros(7)
+ initial_state = [env.robots[0]._hand_pos, T.mat2quat(env.robots[0]._hand_orn)]
+ dstate = [
+ env.robots[0]._hand_pos - initial_state[0],
+ T.mat2euler(T.quat2mat(T.quat_distance(T.mat2quat(env.robots[0]._hand_orn), initial_state[1]))),
+ ]
+
+ # Define the uniform trajectory action
+ if traj == "pos":
+ pos_act = pos_action_ik if controller_name == "IK_POSE" else pos_action_osc
+ rot_act = np.zeros(3)
+ else:
+ pos_act = np.zeros(3)
+ rot_act = rot_action_ik if controller_name == "IK_POSE" else rot_action_osc
+
+ # Compose the action
+ action = np.concatenate([pos_act, rot_act, [0]])
+
+ # Determine which trajectory we're executing
+ k = 0 if traj == "pos" else 1
+ j = 0 if not interpolator else 1
+
+ # Run trajectory until the threshold condition is met
+ while abs(dstate[k][indexes[k]]) < abs(thresholds[k]):
+ _, summed_torques, current_torques = step(env, action, current_torques)
+ if args.render:
+ env.render()
+
+ # Update torques, timestep count, and state
+ summed_abs_delta_torques[j] += summed_torques
+ timesteps[j] += 1
+ dstate = [
+ env.robots[0]._hand_pos - initial_state[0],
+ T.mat2euler(T.quat2mat(T.quat_distance(T.mat2quat(env.robots[0]._hand_orn), initial_state[1]))),
+ ]
+
+ # When finished, print out the timestep results
+ print(
+ "Completed trajectory. Avg per-step absolute delta torques: {}".format(
+ summed_abs_delta_torques[j] / timesteps[j]
+ )
+ )
+
+ # Shut down this env before starting the next test
+ env.close()
+
+ # Tests completed!
+ print()
+ print("-" * 80)
+ print("All linear interpolator testing completed.\n")
+
+
+if __name__ == "__main__":
+ test_linear_interpolator()
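For intuition about the ramp ratio discussed in the docstring above, here is a small standalone sketch of linear setpoint interpolation. It mirrors the described behavior rather than robosuite's actual `LinearInterpolator` class, and the substep count is only an example (a 20 Hz control frequency over a 0.002 s model timestep gives 25 substeps per policy action).

```python
# Illustrative sketch: ramp the controller setpoint from the previous goal to the
# new goal over `ramp_ratio` of the simulation substeps, then hold the new goal.
import numpy as np


def interpolated_setpoints(prev_goal, new_goal, substeps, ramp_ratio=1.0):
    prev_goal = np.asarray(prev_goal, dtype=float)
    new_goal = np.asarray(new_goal, dtype=float)
    ramp_steps = max(1, int(round(substeps * ramp_ratio)))
    for i in range(1, substeps + 1):
        frac = min(i / ramp_steps, 1.0)  # rises toward 1 over the ramp window
        yield prev_goal + frac * (new_goal - prev_goal)


setpoints = list(interpolated_setpoints([0.0, 0.0, 0.0], [0.0, 0.4, 0.0], substeps=25))
print(setpoints[0], setpoints[-1])  # first setpoint is partway to the goal; last equals it
```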
diff --git a/phantom/submodules/phantom-robosuite/tests/test_controllers/test_variable_impedance.py b/phantom/submodules/phantom-robosuite/tests/test_controllers/test_variable_impedance.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dc80c41de0b22526786987c69501bee1ed38596
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_controllers/test_variable_impedance.py
@@ -0,0 +1,189 @@
+"""
+Test the variable impedance feature of impedance-based controllers (OSC, Joint Position) on the Lift task with
+Sawyer arm environment as a test case.
+
+The variable impedance feature allows per-action fine-grained control over the specific impedance gains when executing
+impedance control (namely, "kp" and "damping" ratios). This allows a given controller to execute more complex and
+potentially interactive trajectories by varying the net impedance of the controlled actuators over time.
+
+This (qualitative) test verifies that the variable impedance works correctly on both the OSC Pose / Position and
+Joint Position controllers, and proceeds as follows:
+
+ 1. Given a constant delta position action, and with the kp values set to be critically damped, we will ramp the
+ kp values up to their max and then ramp them back down. We qualitatively expect the arm to accelerate as the kp
+ values are ramped up, and then slow down as they are decreased.
+
+ 2. The environment will then be reset. Given a constant delta position action, and with kp values set to their
+ default value, we will ramp the damping values up to their max and then ramp them back down. We qualitatively
+ expect the arm to slow down as the damping values are ramped up, and then increase in speed as they are decreased.
+
+ 3. We will repeat Step 1 and 2 for each of the tested controllers.
+
+Periodic printouts should verify the above patterns; alternatively, running the script with the "--render" argument will
+render the trajectories to allow for visual analysis of the gains' effects.
+"""
+
+import argparse
+import json
+import os
+
+import numpy as np
+
+import robosuite as suite
+
+# Define the rate of change when sweeping through kp / damping values
+num_timesteps_per_change = 10
+percent_increase = 0.05
+
+# Define delta values for trajectory
+d = 0.05
+
+# Define default values for fixing one of the two gains
+kp_default = 150
+damping_default = 1 # critically damped
+
+# Define arguments for this test
+parser = argparse.ArgumentParser()
+parser.add_argument("--render", action="store_true", help="Whether to render tests or run headless")
+args = parser.parse_args()
+
+
+# Running the actual test #
+def test_variable_impedance():
+
+ for controller_name in ["OSC_POSE", "OSC_POSITION", "JOINT_POSITION"]:
+
+ # Define numpy seed so we guarantee consistent starting pos / ori for each trajectory
+ np.random.seed(3)
+
+ # Define controller path to load
+ controller_path = os.path.join(
+ os.path.dirname(__file__), "../../robosuite", "controllers/config/{}.json".format(controller_name.lower())
+ )
+
+ # Load the controller
+ with open(controller_path) as f:
+ controller_config = json.load(f)
+
+ # Manually edit impedance settings
+ controller_config["impedance_mode"] = "variable"
+ controller_config["kp_limits"] = [0, 300]
+ controller_config["damping_limits"] = [0, 10]
+
+ # Now, create a test env for testing the controller on
+ env = suite.make(
+ "Lift",
+ robots="Sawyer",
+ has_renderer=args.render, # by default, don't use on-screen renderer for visual validation
+ has_offscreen_renderer=False,
+ use_camera_obs=False,
+ horizon=10000,
+ control_freq=20,
+ controller_configs=controller_config,
+ )
+
+ # Setup printing options for numbers
+ np.set_printoptions(formatter={"float": lambda x: "{0:0.3f}".format(x)})
+
+ # Get limits on kp and damping values
+ # Define control dim. Note that this is not the action space, but internal dimensionality of gains
+ control_dim = 6 if "OSC" in controller_name else 7
+ low, high = env.action_spec
+ damping_low, kp_low = low[:control_dim], low[control_dim : 2 * control_dim]
+ damping_high, kp_high = high[:control_dim], high[control_dim : 2 * control_dim]
+ damping_range = damping_high - damping_low
+ kp_range = kp_high - kp_low
+
+ # Get delta values for trajectory
+ if controller_name == "OSC_POSE":
+ delta = np.array([0, d, 0, 0, 0, 0])
+ elif controller_name == "OSC_POSITION":
+ delta = np.array([0, d, 0])
+ else: # JOINT_POSITION
+ delta = np.array([d, 0, 0, 0, 0, 0, 0])
+
+ # Get total number of steps each test should take (num steps ramping up + num steps ramping down)
+ total_steps = num_timesteps_per_change / percent_increase * 2
+
+ # Run a test for both kp and damping
+ gains = ["kp", "damping"]
+
+ for gain in gains:
+
+ # Reset the environment
+ env.reset()
+
+ # Hardcode the starting position for sawyer
+ init_qpos = [-0.5538, -0.8208, 0.4155, 1.8409, -0.4955, 0.6482, 1.9628]
+ env.robots[0].set_robot_joint_positions(init_qpos)
+ env.robots[0].controller.update_initial_joints(init_qpos)
+
+ # Notify user a new test is beginning
+ print("\nTesting controller {} while sweeping {}...".format(controller_name, gain))
+
+ # If rendering, set controller to front view to get best angle for viewing robot movements
+ if args.render:
+ env.viewer.set_camera(camera_id=0)
+
+ # Keep track of relative changes in robot eef position
+ last_pos = env.robots[0]._hand_pos
+
+ # Initialize gains
+ if gain == "kp":
+ kp = kp_low
+ damping = damping_default * np.ones(control_dim)
+ gain_val = kp # alias for kp
+ gain_range = kp_range
+ else: # "damping"
+ kp = kp_default * np.ones(control_dim)
+ damping = damping_low
+ gain_val = damping # alias for damping
+ gain_range = damping_range
+
+ # Initialize counters
+ i = 0
+ sign = 1.0 # Whether to increase or decrease gain
+
+ # Run trajectory until the threshold condition is met
+ while i < total_steps:
+ # Create action (damping, kp, traj, gripper)
+ action = np.concatenate([damping, kp, sign * delta, [0]])
+
+ # Take an environment step
+ env.step(action)
+ if args.render:
+ env.render()
+
+ # Update the current change in state
+ cur_pos = env.robots[0]._hand_pos
+
+ # If we're at the end of the increase, switch direction of traj and gain changes
+ if i == int(num_timesteps_per_change / percent_increase):
+ sign *= -1.0
+
+ # Update gain if this is a changing step
+ if i % num_timesteps_per_change == 0:
+ # Compare delta, print out to user, and update last_pos
+ delta_pos = np.linalg.norm(cur_pos - last_pos)
+ print(" Magnitude eef distance change with {} = {}: {:.5f}".format(gain, gain_val[0], delta_pos))
+ last_pos = cur_pos
+ # Update gain
+ gain_val += percent_increase * gain_range * sign
+
+ # Update timestep count
+ i += 1
+
+ # When finished, print out the timestep results
+ print("Completed trajectory.")
+
+ # Shut down this env before starting the next test
+ env.close()
+
+ # Tests completed!
+ print()
+ print("-" * 80)
+ print("All variable impedance testing completed.\n")
+
+
+if __name__ == "__main__":
+ test_variable_impedance()
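As a quick reference for the action layout exercised by this test, the sketch below composes a single variable-impedance action for OSC_POSE the same way the test does, as `[damping | kp | pose delta | gripper]`; the gain values are illustrative defaults taken from the constants above.

```python
# Sketch of one variable-impedance action for OSC_POSE:
# 6 damping ratios + 6 kp gains + 6 pose deltas + 1 gripper value = 19 dims.
import numpy as np

control_dim = 6                                     # OSC_POSE controls a 6-DoF pose
damping = 1.0 * np.ones(control_dim)                # per-DoF damping ratios
kp = 150.0 * np.ones(control_dim)                   # per-DoF proportional gains
delta = np.array([0.0, 0.05, 0.0, 0.0, 0.0, 0.0])   # constant +y translation command
gripper = [0.0]

action = np.concatenate([damping, kp, delta, gripper])
assert action.shape == (3 * control_dim + 1,)       # 19-dimensional action
```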
diff --git a/phantom/submodules/phantom-robosuite/tests/test_environments/test_action_playback.py b/phantom/submodules/phantom-robosuite/tests/test_environments/test_action_playback.py
new file mode 100644
index 0000000000000000000000000000000000000000..96f256e0cd10fc22a8e923b2c8d8c4791680ed23
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_environments/test_action_playback.py
@@ -0,0 +1,76 @@
+"""
+Test script for recording a sequence of random actions and playing them back
+"""
+
+import argparse
+import json
+import os
+import random
+
+import h5py
+import numpy as np
+
+import robosuite
+from robosuite.controllers import load_controller_config
+
+
+def test_playback():
+ # set seeds
+ random.seed(0)
+ np.random.seed(0)
+
+ env = robosuite.make(
+ "Lift",
+ robots=["Panda"],
+ controller_configs=load_controller_config(default_controller="OSC_POSE"),
+ has_renderer=False,
+ has_offscreen_renderer=False,
+ ignore_done=True,
+ use_camera_obs=False,
+ reward_shaping=True,
+ control_freq=20,
+ )
+ env.reset()
+
+ # task instance
+ task_xml = env.sim.model.get_xml()
+ task_init_state = np.array(env.sim.get_state().flatten())
+
+ # trick for ensuring that we can play MuJoCo demonstrations back
+ # deterministically by using the recorded actions open loop
+ env.reset_from_xml_string(task_xml)
+ env.sim.reset()
+ env.sim.set_state_from_flattened(task_init_state)
+ env.sim.forward()
+
+ # random actions to play
+ n_actions = 100
+ actions = 0.1 * np.random.uniform(low=-1.0, high=1.0, size=(n_actions, env.action_spec[0].shape[0]))
+
+ # play actions
+ print("playing random actions...")
+ states = [task_init_state]
+ for i in range(n_actions):
+ env.step(actions[i])
+ states.append(np.array(env.sim.get_state().flatten()))
+
+ # try playback
+ print("attempting playback...")
+ env.reset()
+ env.reset_from_xml_string(task_xml)
+ env.sim.reset()
+ env.sim.set_state_from_flattened(task_init_state)
+ env.sim.forward()
+
+ for i in range(n_actions):
+ env.step(actions[i])
+ state_playback = env.sim.get_state().flatten()
+ assert np.all(np.equal(states[i + 1], state_playback))
+
+ env.close()
+ print("test passed!")
+
+
+if __name__ == "__main__":
+
+ test_playback()
diff --git a/phantom/submodules/phantom-robosuite/tests/test_environments/test_all_environments.py b/phantom/submodules/phantom-robosuite/tests/test_environments/test_all_environments.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbbe918653d248efe2909598f53c990829369fae
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_environments/test_all_environments.py
@@ -0,0 +1,88 @@
+"""
+Test all environments with random policies.
+
+This runs some basic sanity checks on the environment, namely, checking that:
+ - proprio-state exists in the obs, and is a flat array
+ - agentview_image exists and is of the correct shape
+ - no object-obs in state, because we are only using image observations
+
+Obviously, if an environment crashes during runtime, that is considered a failure as well.
+"""
+import numpy as np
+
+import robosuite as suite
+
+
+def test_all_environments():
+
+ envs = sorted(suite.ALL_ENVIRONMENTS)
+ for env_name in envs:
+ # Create config dict
+ env_config = {"env_name": env_name}
+ for robot_name in ("Panda", "Sawyer", "Baxter"):
+ # create an environment for learning on pixels
+ config = None
+ if "TwoArm" in env_name:
+ if robot_name == "Baxter":
+ robots = robot_name
+ config = "bimanual"
+ else:
+ robots = [robot_name, robot_name]
+ config = "single-arm-opposed"
+ # compile configuration specs
+ env_config["robots"] = robots
+ env_config["env_configuration"] = config
+ else:
+ if robot_name == "Baxter":
+ continue
+ env_config["robots"] = robot_name
+
+ # Notify user of which test we are currently on
+ print("Testing env: {} with robots {} with config {}...".format(env_name, env_config["robots"], config))
+
+ # Create environment
+ env = suite.make(
+ **env_config,
+ has_renderer=False, # no on-screen renderer
+ has_offscreen_renderer=True, # off-screen renderer is required for camera observations
+ ignore_done=True, # (optional) never terminates episode
+ use_camera_obs=True, # use camera observations
+ camera_heights=84, # set camera height
+ camera_widths=84, # set camera width
+ camera_names="agentview", # use "agentview" camera
+ use_object_obs=False, # no object feature when training on pixels
+ reward_shaping=True, # (optional) using a shaping reward
+ )
+
+ obs = env.reset()
+
+ # get action range
+ action_min, action_max = env.action_spec
+ assert action_min.shape == action_max.shape
+
+ # Get robot prefix
+ pr = env.robots[0].robot_model.naming_prefix
+
+ # run 10 random actions
+ for _ in range(10):
+
+ assert pr + "proprio-state" in obs
+ assert obs[pr + "proprio-state"].ndim == 1
+
+ assert "agentview_image" in obs
+ assert obs["agentview_image"].shape == (84, 84, 3)
+
+ assert "object-state" not in obs
+
+ action = np.random.uniform(action_min, action_max)
+ obs, reward, done, info = env.step(action)
+
+ env.close()
+
+ # Tests passed!
+ print("All environment tests passed successfully!")
+
+
+if __name__ == "__main__":
+
+ test_all_environments()
diff --git a/phantom/submodules/phantom-robosuite/tests/test_environments/test_camera_transforms.py b/phantom/submodules/phantom-robosuite/tests/test_environments/test_camera_transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd3e8b6516c2f51edc883fcf83b5da0ad1c73ac6
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_environments/test_camera_transforms.py
@@ -0,0 +1,93 @@
+"""
+Test script for camera transforms. This test will read the ground-truth
+object state in the Lift environment, transform it into a pixel location
+in the camera frame, then transform it back to the world frame, and assert
+that the values are close.
+"""
+import random
+
+import numpy as np
+
+import robosuite
+import robosuite.utils.camera_utils as CU
+from robosuite.controllers import load_controller_config
+
+
+def test_camera_transforms():
+ # set seeds
+ random.seed(0)
+ np.random.seed(0)
+
+ camera_name = "agentview"
+ camera_height = 120
+ camera_width = 120
+ env = robosuite.make(
+ "Lift",
+ robots=["Panda"],
+ controller_configs=load_controller_config(default_controller="OSC_POSE"),
+ has_renderer=False,
+ has_offscreen_renderer=True,
+ ignore_done=True,
+ use_object_obs=True,
+ use_camera_obs=True,
+ camera_names=[camera_name],
+ camera_depths=[True],
+ camera_heights=[camera_height],
+ camera_widths=[camera_width],
+ reward_shaping=True,
+ control_freq=20,
+ )
+ obs_dict = env.reset()
+ sim = env.sim
+
+ # ground-truth object position
+ obj_pos = obs_dict["object-state"][:3]
+
+ # camera frame
+ image = obs_dict["{}_image".format(camera_name)][::-1]
+
+ # unnormalized depth map
+ depth_map = obs_dict["{}_depth".format(camera_name)][::-1]
+
+ depth_map = CU.get_real_depth_map(sim=env.sim, depth_map=depth_map)
+
+ # get camera matrices
+ world_to_camera = CU.get_camera_transform_matrix(
+ sim=env.sim,
+ camera_name=camera_name,
+ camera_height=camera_height,
+ camera_width=camera_width,
+ )
+ camera_to_world = np.linalg.inv(world_to_camera)
+
+ # transform object position into camera pixel
+ obj_pixel = CU.project_points_from_world_to_camera(
+ points=obj_pos,
+ world_to_camera_transform=world_to_camera,
+ camera_height=camera_height,
+ camera_width=camera_width,
+ )
+
+ # transform from camera pixel back to world position
+ estimated_obj_pos = CU.transform_from_pixels_to_world(
+ pixels=obj_pixel,
+ depth_map=depth_map,
+ camera_to_world_transform=camera_to_world,
+ )
+
+ # the most we should be off by in the z-direction is 3^0.5 times the maximum half-size of the cube
+ max_z_err = np.sqrt(3) * 0.022
+ z_err = np.abs(obj_pos[2] - estimated_obj_pos[2])
+ assert z_err < max_z_err
+
+ print("pixel: {}".format(obj_pixel))
+ print("obj pos: {}".format(obj_pos))
+ print("estimated obj pos: {}".format(estimated_obj_pos))
+ print("z err: {}".format(z_err))
+
+ env.close()
+
+
+if __name__ == "__main__":
+
+ test_camera_transforms()
diff --git a/phantom/submodules/phantom-robosuite/tests/test_grippers/test_all_grippers.py b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_all_grippers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ebae9cb04220b1e0e9f70daafe073695f49f377
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_all_grippers.py
@@ -0,0 +1,29 @@
+"""
+Tests the basic interface of all grippers.
+
+This runs some basic sanity checks on each gripper, namely, checking that:
+ - the gripper's action and init_qpos exist and are valid
+
+Obviously, if an environment crashes during runtime, that is considered a failure as well.
+"""
+from robosuite.models.grippers import GRIPPER_MAPPING
+
+
+def test_all_gripper():
+ for name, gripper in GRIPPER_MAPPING.items():
+ # Test all grippers except the null gripper
+ if name not in {None, "WipingGripper"}:
+ print("Testing {}...".format(name))
+ _test_gripper(gripper())
+
+
+def _test_gripper(gripper):
+ action = gripper.format_action([1] * gripper.dof)
+ assert action is not None
+
+ assert gripper.init_qpos is not None
+
+
+if __name__ == "__main__":
+ test_all_gripper()
+ print("Gripper tests completed.")
diff --git a/phantom/submodules/phantom-robosuite/tests/test_grippers/test_jaco_threefinger.py b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_jaco_threefinger.py
new file mode 100644
index 0000000000000000000000000000000000000000..d64f6ba8bb6910e852fa6505840bb2a1fcc2cc1e
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_jaco_threefinger.py
@@ -0,0 +1,26 @@
+from robosuite.models.grippers import GripperTester, JacoThreeFingerGripper
+
+
+def test_jaco_three_finger():
+ jaco_three_finger_tester(False)
+
+
+def jaco_three_finger_tester(render, total_iters=1, test_y=True):
+ gripper = JacoThreeFingerGripper()
+ tester = GripperTester(
+ gripper=gripper,
+ pos="0 0 0.3",
+ quat="0 0 1 0",
+ gripper_low_pos=0.01,
+ gripper_high_pos=0.1,
+ box_size=[0.025] * 3,
+ step_time=1000,
+ render=render,
+ )
+ tester.start_simulation()
+ tester.loop(total_iters=total_iters, test_y=test_y)
+ tester.close()
+
+
+if __name__ == "__main__":
+ jaco_three_finger_tester(True, 20, False)
diff --git a/phantom/submodules/phantom-robosuite/tests/test_grippers/test_panda_gripper.py b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_panda_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..3139ae2690639864c813547089149610968a0dd5
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_panda_gripper.py
@@ -0,0 +1,28 @@
+"""
+Tests panda gripper on grabbing task
+"""
+from robosuite.models.grippers import GripperTester, PandaGripper
+
+
+def test_panda_gripper():
+ panda_gripper_tester(False)
+
+
+def panda_gripper_tester(render, total_iters=1, test_y=True):
+ gripper = PandaGripper()
+ tester = GripperTester(
+ gripper=gripper,
+ pos="0 0 0.3",
+ quat="0 0 1 0",
+ gripper_low_pos=-0.10,
+ gripper_high_pos=0.01,
+ render=render,
+ )
+ tester.start_simulation()
+ tester.loop(total_iters=total_iters, test_y=test_y)
+ tester.close()
+
+
+if __name__ == "__main__":
+ panda_gripper_tester(True, 20, True)
+ panda_gripper_tester(True, 20, True)
diff --git a/phantom/submodules/phantom-robosuite/tests/test_grippers/test_rethink_gripper.py b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_rethink_gripper.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b937878bd9c05646bc3236e736f1bae083533f2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_rethink_gripper.py
@@ -0,0 +1,27 @@
+"""
+Tests two finger gripper and left two finger gripper on grabbing task
+"""
+from robosuite.models.grippers import GripperTester, RethinkGripper
+
+
+def test_two_finger():
+ two_finger_tester(False)
+
+
+def two_finger_tester(render, total_iters=1, test_y=True):
+ gripper = RethinkGripper()
+ tester = GripperTester(
+ gripper=gripper,
+ pos="0 0 0.3",
+ quat="0 0 1 0",
+ gripper_low_pos=-0.07,
+ gripper_high_pos=0.02,
+ render=render,
+ )
+ tester.start_simulation()
+ tester.loop(total_iters=total_iters, test_y=test_y)
+ tester.close()
+
+
+if __name__ == "__main__":
+ two_finger_tester(True, 20, True)
diff --git a/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_140.py b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_140.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6235138beb779ed51df4aa9f2a4fd4a50aba029
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_140.py
@@ -0,0 +1,25 @@
+from robosuite.models.grippers import GripperTester, Robotiq140Gripper
+
+
+def test_robotiq():
+ robotiq_tester(False)
+
+
+def robotiq_tester(render, total_iters=1, test_y=True):
+ gripper = Robotiq140Gripper()
+ tester = GripperTester(
+ gripper=gripper,
+ pos="0 0 0.3",
+ quat="0 0 1 0",
+ gripper_low_pos=0.02,
+ gripper_high_pos=0.1,
+ box_size=[0.025] * 3,
+ render=render,
+ )
+ tester.start_simulation()
+ tester.loop(total_iters=total_iters, test_y=test_y)
+ tester.close()
+
+
+if __name__ == "__main__":
+ robotiq_tester(True, 20, False)
diff --git a/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_85.py b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_85.py
new file mode 100644
index 0000000000000000000000000000000000000000..636b0b64ff8638fc971167bbcc05ca0b4e7a6a2f
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_85.py
@@ -0,0 +1,25 @@
+from robosuite.models.grippers import GripperTester, Robotiq85Gripper
+
+
+def test_robotiq():
+ robotiq_tester(False)
+
+
+def robotiq_tester(render, total_iters=1, test_y=True):
+ gripper = Robotiq85Gripper()
+ tester = GripperTester(
+ gripper=gripper,
+ pos="-0.02 0 0.3",
+ quat="0 0 1 0",
+ gripper_low_pos=-0.065,
+ gripper_high_pos=0.01,
+ box_size=[0.025] * 3,
+ render=render,
+ )
+ tester.start_simulation()
+ tester.loop(total_iters=total_iters, test_y=test_y)
+ tester.close()
+
+
+if __name__ == "__main__":
+ robotiq_tester(True, 20, False)
diff --git a/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_threefinger.py b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_threefinger.py
new file mode 100644
index 0000000000000000000000000000000000000000..a040d7686d4d46bd361474a0bc511c005a94b6a2
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_grippers/test_robotiq_threefinger.py
@@ -0,0 +1,26 @@
+from robosuite.models.grippers import GripperTester, RobotiqThreeFingerGripper
+
+
+def test_robotiq_three_finger():
+ robotiq_three_finger_tester(False)
+
+
+def robotiq_three_finger_tester(render, total_iters=1, test_y=True):
+ gripper = RobotiqThreeFingerGripper()
+ tester = GripperTester(
+ gripper=gripper,
+ pos="0 0 0.3",
+ quat="0 0 1 0",
+ gripper_low_pos=-0.02,
+ gripper_high_pos=0.1,
+ box_size=[0.035] * 3,
+ box_density=500,
+ render=render,
+ )
+ tester.start_simulation()
+ tester.loop(total_iters=total_iters, test_y=test_y)
+ tester.close()
+
+
+if __name__ == "__main__":
+ robotiq_three_finger_tester(True, 20, False)
diff --git a/phantom/submodules/phantom-robosuite/tests/test_robots/test_all_robots.py b/phantom/submodules/phantom-robosuite/tests/test_robots/test_all_robots.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e6340dd2d9c96a5617d9720c9c11556433cae3b
--- /dev/null
+++ b/phantom/submodules/phantom-robosuite/tests/test_robots/test_all_robots.py
@@ -0,0 +1,28 @@
+"""
+Tests the basic interface of all robots.
+
+This runs some basic sanity checks on the robots, namely, checking that:
+ - all single-arm robots have properly defined contact geoms.
+
+Obviously, if an environment crashes during runtime, that is considered a failure as well.
+"""
+from robosuite.robots import ROBOT_CLASS_MAPPING, SingleArm
+
+
+def test_single_arm_robots():
+ for name, robot in ROBOT_CLASS_MAPPING.items():
+ if robot == SingleArm:
+ print(f"Testing {name}")
+ _test_contact_geoms(robot(name))
+
+
+def _test_contact_geoms(robot):
+ robot.load_model()
+ contact_geoms = robot.robot_model._contact_geoms
+ for geom in contact_geoms:
+ assert isinstance(geom, str), f"The geom {geom} is of type {type(geom)}, but should be {type('placeholder')}"
+
+
+if __name__ == "__main__":
+ test_single_arm_robots()
+ print("Robot tests completed.")
diff --git a/phantom/submodules/sam2/.clang-format b/phantom/submodules/sam2/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..39b1b3d603ed0cf6b7f94c9c08067f148f35613f
--- /dev/null
+++ b/phantom/submodules/sam2/.clang-format
@@ -0,0 +1,85 @@
+AccessModifierOffset: -1
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+ AfterClass: false
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: false
+ AfterNamespace: false
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 80
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ForEachMacros: [ FOR_EACH, FOR_EACH_R, FOR_EACH_RANGE, ]
+IncludeCategories:
+ - Regex: '^<.*\.h(pp)?>'
+ Priority: 1
+ - Regex: '^<.*'
+ Priority: 2
+ - Regex: '.*'
+ Priority: 3
+IndentCaseLabels: true
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
diff --git a/phantom/submodules/sam2/.github/workflows/check_fmt.yml b/phantom/submodules/sam2/.github/workflows/check_fmt.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0a29b884af2b5c0bdb71b607e7b8220e879755be
--- /dev/null
+++ b/phantom/submodules/sam2/.github/workflows/check_fmt.yml
@@ -0,0 +1,17 @@
+name: SAM2/fmt
+on:
+ pull_request:
+ branches:
+ - main
+jobs:
+ ufmt_check:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check formatting
+ uses: omnilib/ufmt@action-v1
+ with:
+ path: sam2 tools
+ version: "2.0.0b2"
+ python-version: "3.10"
+ black-version: "24.2.0"
+ usort-version: "1.0.2"
diff --git a/phantom/submodules/sam2/.gitignore b/phantom/submodules/sam2/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..121d46aa5c1854ee2b2a5085ff2ce22f5a04043f
--- /dev/null
+++ b/phantom/submodules/sam2/.gitignore
@@ -0,0 +1,11 @@
+.vscode/
+.DS_Store
+__pycache__/
+*-checkpoint.ipynb
+.venv
+*.egg*
+build/*
+_C.*
+outputs/*
+checkpoints/*.pt
+demo/backend/checkpoints/*.pt
diff --git a/phantom/submodules/sam2/.watchmanconfig b/phantom/submodules/sam2/.watchmanconfig
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/phantom/submodules/sam2/.watchmanconfig
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/phantom/submodules/sam2/CODE_OF_CONDUCT.md b/phantom/submodules/sam2/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..08b500a221857ec3f451338e80b4a9ab1173a1af
--- /dev/null
+++ b/phantom/submodules/sam2/CODE_OF_CONDUCT.md
@@ -0,0 +1,80 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+This Code of Conduct also applies outside the project spaces when there is a
+reasonable belief that an individual's behavior may have a negative impact on
+the project or its community.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at . All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/phantom/submodules/sam2/CONTRIBUTING.md b/phantom/submodules/sam2/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad15049f583e1bc9a418686493405875b98c7f0f
--- /dev/null
+++ b/phantom/submodules/sam2/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# Contributing to segment-anything
+We want to make contributing to this project as easy and transparent as
+possible.
+
+## Pull Requests
+We actively welcome your pull requests.
+
+1. Fork the repo and create your branch from `main`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints, using the `ufmt format` command. Linting requires `black==24.2.0`, `usort==1.0.2`, and `ufmt==2.0.0b2`, which can be installed via `pip install -e ".[dev]"`.
+6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here:
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## License
+By contributing to segment-anything, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/INSTALL.md b/phantom/submodules/sam2/INSTALL.md
new file mode 100644
index 0000000000000000000000000000000000000000..9480ba1bb52c171cfccc6a078c68abdb49125daa
--- /dev/null
+++ b/phantom/submodules/sam2/INSTALL.md
@@ -0,0 +1,189 @@
+## Installation
+
+### Requirements
+
+- Linux with Python ≥ 3.10, PyTorch ≥ 2.5.1 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. Install them together at https://pytorch.org to ensure this.
+ * Note older versions of Python or PyTorch may also work. However, the versions above are strongly recommended to provide all features such as `torch.compile`.
+- [CUDA toolkits](https://developer.nvidia.com/cuda-toolkit-archive) that match the CUDA version for your PyTorch installation. This should typically be CUDA 12.1 if you follow the default installation command.
+- If you are installing on Windows, it's strongly recommended to use [Windows Subsystem for Linux (WSL)](https://learn.microsoft.com/en-us/windows/wsl/install) with Ubuntu.
+
+Then, install SAM 2 from the root of this repository via
+```bash
+pip install -e ".[notebooks]"
+```
+
+Note that you may skip building the SAM 2 CUDA extension during installation via environment variable `SAM2_BUILD_CUDA=0`, as follows:
+```bash
+# skip the SAM 2 CUDA extension
+SAM2_BUILD_CUDA=0 pip install -e ".[notebooks]"
+```
+This would also skip the post-processing step at runtime (removing small holes and sprinkles in the output masks, which requires the CUDA extension), but shouldn't affect the results in most cases.
+
+### Building the SAM 2 CUDA extension
+
+By default, we allow the installation to proceed even if the SAM 2 CUDA extension fails to build. (In this case, the build errors are hidden unless using `-v` for verbose output in `pip install`.)
+
+If you see a message like `Skipping the post-processing step due to the error above` at runtime or `Failed to build the SAM 2 CUDA extension due to the error above` during installation, it indicates that the SAM 2 CUDA extension failed to build in your environment. In this case, **you can still use SAM 2 for both image and video applications**. The post-processing step (removing small holes and sprinkles in the output masks) will be skipped, but this shouldn't affect the results in most cases.
+
+If you would like to enable this post-processing step, you can reinstall SAM 2 on a GPU machine with environment variable `SAM2_BUILD_ALLOW_ERRORS=0` to force building the CUDA extension (and raise errors if it fails to build), as follows
+```bash
+pip uninstall -y SAM-2 && \
+rm -f ./sam2/*.so && \
+SAM2_BUILD_ALLOW_ERRORS=0 pip install -v -e ".[notebooks]"
+```
+
+Note that PyTorch needs to be installed first before building the SAM 2 CUDA extension. It's also necessary to install [CUDA toolkits](https://developer.nvidia.com/cuda-toolkit-archive) that match the CUDA version for your PyTorch installation. (This should typically be CUDA 12.1 if you follow the default installation command.) After installing the CUDA toolkits, you can check its version via `nvcc --version`.
+
+Please check the section below on common installation issues if the CUDA extension fails to build during installation or load at runtime.
+
+### Common Installation Issues
+
+Common issues and their solutions:
+
+**I got `ImportError: cannot import name '_C' from 'sam2'`**
+
+This is usually because you haven't run the `pip install -e ".[notebooks]"` step above or the installation failed. Please install SAM 2 first, and see the other issues if your installation fails.
+
+In some systems, you may need to run `python setup.py build_ext --inplace` in the SAM 2 repo root as suggested in https://github.com/facebookresearch/sam2/issues/77.
+
+**I got `MissingConfigException: Cannot find primary config 'configs/sam2.1/sam2.1_hiera_l.yaml'`**
+
+This is usually because you haven't run the `pip install -e .` step above, so `sam2` isn't in your Python's `sys.path`. Please run this installation step. In case it still fails after the installation step, you may try manually adding the root of this repo to `PYTHONPATH` via
+```bash
+export SAM2_REPO_ROOT=/path/to/sam2 # path to this repo
+export PYTHONPATH="${SAM2_REPO_ROOT}:${PYTHONPATH}"
+```
+so that the repo root (which contains the `sam2` configs) is on your Python's `sys.path`.
+
+#### I got `RuntimeError: Error(s) in loading state_dict for SAM2Base` when loading the new SAM 2.1 checkpoints
+
+This is likely because you have installed a previous version of this repo, which doesn't have the new modules to support the SAM 2.1 checkpoints yet. Please try the following steps:
+
+1. pull the latest code from the `main` branch of this repo
+2. run `pip uninstall -y SAM-2` to uninstall any previous installations
+3. then install the latest repo again using `pip install -e ".[notebooks]"`
+
+In case the steps above still don't resolve the error, please try running the following in your Python environment:
+```python
+from sam2.modeling import sam2_base
+
+print(sam2_base.__file__)
+```
+and check whether the file at the printed local path of `sam2/modeling/sam2_base.py` matches the latest version at https://github.com/facebookresearch/sam2/blob/main/sam2/modeling/sam2_base.py (e.g. whether your local file has `no_obj_embed_spatial`) to identify whether you're still using a previous installation.
+
+#### My installation failed with `CUDA_HOME environment variable is not set`
+
+This usually happens because the installation step cannot find the CUDA toolkits (that contain the NVCC compiler) to build a custom CUDA kernel in SAM 2. Please install the [CUDA toolkits](https://developer.nvidia.com/cuda-toolkit-archive) with a version that matches the CUDA version of your PyTorch installation. If the error persists after installing CUDA toolkits, you may explicitly specify `CUDA_HOME` via
+```
+export CUDA_HOME=/usr/local/cuda # change to your CUDA toolkit path
+```
+and rerun the installation.
+
+Also, you should make sure
+```
+python -c 'import torch; from torch.utils.cpp_extension import CUDA_HOME; print(torch.cuda.is_available(), CUDA_HOME)'
+```
+prints `True` and a directory containing CUDA, to verify that the CUDA toolkits are correctly set up.
+
+If you are still having problems after verifying that the CUDA toolkit is installed and the `CUDA_HOME` environment variable is set properly, you may have to add the `--no-build-isolation` flag to the pip command:
+```
+pip install --no-build-isolation -e .
+```
+
+#### I got `undefined symbol: _ZN3c1015SmallVectorBaseIjE8grow_podEPKvmm` (or similar errors)
+
+This usually happens because you have multiple versions of dependencies (PyTorch or CUDA) in your environment. During installation, the SAM 2 library is compiled against one version of these libraries, while at runtime it links against another. This can happen when different versions of PyTorch or CUDA are installed separately via `pip` or `conda`. You may delete one of the duplicates to keep only a single PyTorch and CUDA version.
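+
+One quick way to spot such duplicates (an illustrative check, not an official diagnostic) is to print which PyTorch build Python actually resolves and where it is installed:
+```python
+# Illustrative check: a conflicting install (e.g. one torch via conda and one via
+# pip) often shows up here as an unexpected install path or CUDA build.
+import torch
+
+print("torch version:", torch.__version__)
+print("built against CUDA:", torch.version.cuda)
+print("loaded from:", torch.__file__)
+```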
+
+In particular, if you have a lower PyTorch version than 2.5.1, it's recommended to upgrade to PyTorch 2.5.1 or higher first. Otherwise, the installation script will try to upgrade to the latest PyTorch using `pip`, which could sometimes lead to duplicated PyTorch installation if you have previously installed another PyTorch version using `conda`.
+
+We have been building SAM 2 against PyTorch 2.5.1 internally. However, a few user comments (e.g. https://github.com/facebookresearch/sam2/issues/22, https://github.com/facebookresearch/sam2/issues/14) suggested that downgrading to PyTorch 2.1.0 might resolve this problem. In case the error persists, you may try changing the restriction from `torch>=2.5.1` to `torch==2.1.0` in both [`pyproject.toml`](pyproject.toml) and [`setup.py`](setup.py) to allow PyTorch 2.1.0.
+
+#### I got `CUDA error: no kernel image is available for execution on the device`
+
+A possible cause could be that the CUDA kernel is somehow not compiled towards your GPU's CUDA [capability](https://developer.nvidia.com/cuda-gpus). This could happen if the installation is done in an environment different from the runtime (e.g. in a slurm system).
+
+You can try pulling the latest code from the SAM 2 repo, setting
+```
+export TORCH_CUDA_ARCH_LIST="9.0 8.0 8.6 8.9 7.0 7.2 7.5 6.0"
+```
+and then rebuilding, to manually specify CUDA capabilities for the compilation target that include your GPU.
+
+#### I got `RuntimeError: No available kernel. Aborting execution.` (or similar errors)
+
+This is probably because your machine doesn't have a GPU or a compatible PyTorch version for Flash Attention (see also https://discuss.pytorch.org/t/using-f-scaled-dot-product-attention-gives-the-error-runtimeerror-no-available-kernel-aborting-execution/180900 for a related discussion on the PyTorch forum). You may be able to resolve this error by replacing the line
+```python
+OLD_GPU, USE_FLASH_ATTN, MATH_KERNEL_ON = get_sdpa_settings()
+```
+in [`sam2/modeling/sam/transformer.py`](sam2/modeling/sam/transformer.py) with
+```python
+OLD_GPU, USE_FLASH_ATTN, MATH_KERNEL_ON = True, True, True
+```
+to relax the attention kernel setting and use other kernels than Flash Attention.
+
+#### I got `Error compiling objects for extension`
+
+You may see an error log like:
+> unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+This is probably because your versions of CUDA and Visual Studio are incompatible (see also https://stackoverflow.com/questions/78515942/cuda-compatibility-with-visual-studio-2022-version-17-10 for a related discussion on Stack Overflow).
+You may be able to fix this by adding the `-allow-unsupported-compiler` argument to `nvcc` after L48 in [setup.py](https://github.com/facebookresearch/sam2/blob/main/setup.py).
+After adding the argument, `get_extensions()` will look like this:
+```python
+def get_extensions():
+ srcs = ["sam2/csrc/connected_components.cu"]
+ compile_args = {
+ "cxx": [],
+ "nvcc": [
+ "-DCUDA_HAS_FP16=1",
+ "-D__CUDA_NO_HALF_OPERATORS__",
+ "-D__CUDA_NO_HALF_CONVERSIONS__",
+ "-D__CUDA_NO_HALF2_OPERATORS__",
+ "-allow-unsupported-compiler" # Add this argument
+ ],
+ }
+ ext_modules = [CUDAExtension("sam2._C", srcs, extra_compile_args=compile_args)]
+ return ext_modules
+```
+
diff --git a/phantom/submodules/sam2/LICENSE b/phantom/submodules/sam2/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/phantom/submodules/sam2/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/phantom/submodules/sam2/LICENSE_cctorch b/phantom/submodules/sam2/LICENSE_cctorch
new file mode 100644
index 0000000000000000000000000000000000000000..23da14a65aad4c5bac18061b80ae6040bb7d2c8c
--- /dev/null
+++ b/phantom/submodules/sam2/LICENSE_cctorch
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2020, the respective contributors, as shown by the AUTHORS file.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/phantom/submodules/sam2/MANIFEST.in b/phantom/submodules/sam2/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..794311fd9854453b134c828c0cb241a7cfdbfc65
--- /dev/null
+++ b/phantom/submodules/sam2/MANIFEST.in
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+recursive-include sam2 *.yaml #include all config files
diff --git a/phantom/submodules/sam2/README.md b/phantom/submodules/sam2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..85a7eb958bced5495ff990c2bcbe7d99662c660f
--- /dev/null
+++ b/phantom/submodules/sam2/README.md
@@ -0,0 +1,224 @@
+# SAM 2: Segment Anything in Images and Videos
+
+**[AI at Meta, FAIR](https://ai.meta.com/research/)**
+
+[Nikhila Ravi](https://nikhilaravi.com/), [Valentin Gabeur](https://gabeur.github.io/), [Yuan-Ting Hu](https://scholar.google.com/citations?user=E8DVVYQAAAAJ&hl=en), [Ronghang Hu](https://ronghanghu.com/), [Chaitanya Ryali](https://scholar.google.com/citations?user=4LWx24UAAAAJ&hl=en), [Tengyu Ma](https://scholar.google.com/citations?user=VeTSl0wAAAAJ&hl=en), [Haitham Khedr](https://hkhedr.com/), [Roman Rädle](https://scholar.google.de/citations?user=Tpt57v0AAAAJ&hl=en), [Chloe Rolland](https://scholar.google.com/citations?hl=fr&user=n-SnMhoAAAAJ), [Laura Gustafson](https://scholar.google.com/citations?user=c8IpF9gAAAAJ&hl=en), [Eric Mintun](https://ericmintun.github.io/), [Junting Pan](https://junting.github.io/), [Kalyan Vasudev Alwala](https://scholar.google.co.in/citations?user=m34oaWEAAAAJ&hl=en), [Nicolas Carion](https://www.nicolascarion.com/), [Chao-Yuan Wu](https://chaoyuan.org/), [Ross Girshick](https://www.rossgirshick.info/), [Piotr Dollár](https://pdollar.github.io/), [Christoph Feichtenhofer](https://feichtenhofer.github.io/)
+
+[[`Paper`](https://ai.meta.com/research/publications/sam-2-segment-anything-in-images-and-videos/)] [[`Project`](https://ai.meta.com/sam2)] [[`Demo`](https://sam2.metademolab.com/)] [[`Dataset`](https://ai.meta.com/datasets/segment-anything-video)] [[`Blog`](https://ai.meta.com/blog/segment-anything-2)] [[`BibTeX`](#citing-sam-2)]
+
+
+
+**Segment Anything Model 2 (SAM 2)** is a foundation model towards solving promptable visual segmentation in images and videos. We extend SAM to video by considering images as a video with a single frame. The model design is a simple transformer architecture with streaming memory for real-time video processing. We build a model-in-the-loop data engine, which improves model and data via user interaction, to collect [**our SA-V dataset**](https://ai.meta.com/datasets/segment-anything-video), the largest video segmentation dataset to date. SAM 2 trained on our data provides strong performance across a wide range of tasks and visual domains.
+
+
+
+## Latest updates
+
+**12/11/2024 -- full model compilation for a major VOS speedup and a new `SAM2VideoPredictor` to better handle multi-object tracking**
+
+- We now support `torch.compile` of the entire SAM 2 model on videos, which can be turned on by setting `vos_optimized=True` in `build_sam2_video_predictor`, leading to a major speedup for VOS inference.
+- We update the implementation of `SAM2VideoPredictor` to support independent per-object inference, allowing us to relax the assumption of prompting for multi-object tracking and adding new objects after tracking starts.
+- See [`RELEASE_NOTES.md`](RELEASE_NOTES.md) for full details.
+
+**09/30/2024 -- SAM 2.1 Developer Suite (new checkpoints, training code, web demo) is released**
+
+- A new suite of improved model checkpoints (denoted as **SAM 2.1**) is released. See [Model Description](#model-description) for details.
+ * To use the new SAM 2.1 checkpoints, you need the latest model code from this repo. If you have installed an earlier version of this repo, please first uninstall the previous version via `pip uninstall SAM-2`, pull the latest code from this repo (with `git pull`), and then reinstall the repo following [Installation](#installation) below.
+- The training (and fine-tuning) code has been released. See [`training/README.md`](training/README.md) on how to get started.
+- The frontend + backend code for the SAM 2 web demo has been released. See [`demo/README.md`](demo/README.md) for details.
+
+## Installation
+
+SAM 2 needs to be installed before use. The code requires `python>=3.10`, as well as `torch>=2.5.1` and `torchvision>=0.20.1`. Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install both PyTorch and TorchVision dependencies. You can install SAM 2 on a GPU machine using:
+
+```bash
+git clone https://github.com/facebookresearch/sam2.git && cd sam2
+
+pip install -e .
+```
+If you are installing on Windows, it's strongly recommended to use [Windows Subsystem for Linux (WSL)](https://learn.microsoft.com/en-us/windows/wsl/install) with Ubuntu.
+
+To use the SAM 2 predictor and run the example notebooks, `jupyter` and `matplotlib` are required and can be installed by:
+
+```bash
+pip install -e ".[notebooks]"
+```
+
+Note:
+1. It's recommended to create a new Python environment via [Anaconda](https://www.anaconda.com/) for this installation and install PyTorch 2.5.1 (or higher) via `pip` following https://pytorch.org/. If you have a PyTorch version lower than 2.5.1 in your current environment, the installation command above will try to upgrade it to the latest PyTorch version using `pip`.
+2. The step above requires compiling a custom CUDA kernel with the `nvcc` compiler. If it isn't already available on your machine, please install the [CUDA toolkits](https://developer.nvidia.com/cuda-toolkit-archive) with a version that matches your PyTorch CUDA version.
+3. If you see a message like `Failed to build the SAM 2 CUDA extension` during installation, you can ignore it and still use SAM 2 (some post-processing functionality may be limited, but it doesn't affect the results in most cases).
+
+Please see [`INSTALL.md`](./INSTALL.md) for FAQs on potential issues and solutions.
+
+## Getting Started
+
+### Download Checkpoints
+
+First, we need to download a model checkpoint. All the model checkpoints can be downloaded by running:
+
+```bash
+cd checkpoints && \
+./download_ckpts.sh && \
+cd ..
+```
+
+or individually from:
+
+- [sam2.1_hiera_tiny.pt](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_tiny.pt)
+- [sam2.1_hiera_small.pt](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_small.pt)
+- [sam2.1_hiera_base_plus.pt](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_base_plus.pt)
+- [sam2.1_hiera_large.pt](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt)
+
+(note that these are the improved checkpoints denoted as SAM 2.1; see [Model Description](#model-description) for details.)
+
+Then SAM 2 can be used in a few lines as follows for image and video prediction.
+
+### Image prediction
+
+SAM 2 has all the capabilities of [SAM](https://github.com/facebookresearch/segment-anything) on static images, and we provide image prediction APIs that closely resemble SAM for image use cases. The `SAM2ImagePredictor` class has an easy interface for image prompting.
+
+```python
+import torch
+from sam2.build_sam import build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+checkpoint = "./checkpoints/sam2.1_hiera_large.pt"
+model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
+predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint))
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    predictor.set_image(<your_image>)
+    masks, _, _ = predictor.predict(<input_prompts>)
+```
+
+Please refer to the examples in [image_predictor_example.ipynb](./notebooks/image_predictor_example.ipynb) (also in Colab [here](https://colab.research.google.com/github/facebookresearch/sam2/blob/main/notebooks/image_predictor_example.ipynb)) for static image use cases.
+
+SAM 2 also supports automatic mask generation on images just like SAM. Please see [automatic_mask_generator_example.ipynb](./notebooks/automatic_mask_generator_example.ipynb) (also in Colab [here](https://colab.research.google.com/github/facebookresearch/sam2/blob/main/notebooks/automatic_mask_generator_example.ipynb)) for automatic mask generation in images.
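+
+For reference, a minimal sketch of programmatic automatic mask generation is shown below (the checkpoint and config paths follow the image prediction example above, the image path is a placeholder, and Pillow is assumed to be available for loading the image); please rely on the notebook for complete, tested examples:
+
+```python
+import numpy as np
+import torch
+from PIL import Image
+
+from sam2.build_sam import build_sam2
+from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
+
+checkpoint = "./checkpoints/sam2.1_hiera_large.pt"
+model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
+mask_generator = SAM2AutomaticMaskGenerator(build_sam2(model_cfg, checkpoint))
+
+# Load any RGB image as an HWC uint8 array (the path is a placeholder).
+image = np.array(Image.open("<your_image>.jpg").convert("RGB"))
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    masks = mask_generator.generate(image)
+
+# Each record is a dict with keys such as "segmentation", "area", "bbox",
+# "predicted_iou" and "stability_score".
+print(len(masks))
+```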
+
+### Video prediction
+
+For promptable segmentation and tracking in videos, we provide a video predictor with APIs to, for example, add prompts and propagate masklets throughout a video. SAM 2 supports video inference on multiple objects and uses an inference state to keep track of the interactions in each video.
+
+```python
+import torch
+from sam2.build_sam import build_sam2_video_predictor
+
+checkpoint = "./checkpoints/sam2.1_hiera_large.pt"
+model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
+predictor = build_sam2_video_predictor(model_cfg, checkpoint)
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    state = predictor.init_state(<your_video>)
+
+    # add new prompts and instantly get the output on the same frame
+    frame_idx, object_ids, masks = predictor.add_new_points_or_box(state, <your_prompts>)
+
+ # propagate the prompts to get masklets throughout the video
+ for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
+ ...
+```
+
+Please refer to the examples in [video_predictor_example.ipynb](./notebooks/video_predictor_example.ipynb) (also in Colab [here](https://colab.research.google.com/github/facebookresearch/sam2/blob/main/notebooks/video_predictor_example.ipynb)) for details on how to add click or box prompts, make refinements, and track multiple objects in videos.
+
+## Load from 🤗 Hugging Face
+
+Alternatively, models can also be loaded from [Hugging Face](https://huggingface.co/models?search=facebook/sam2) (requires `pip install huggingface_hub`).
+
+For image prediction:
+
+```python
+import torch
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    predictor.set_image(<your_image>)
+    masks, _, _ = predictor.predict(<input_prompts>)
+```
+
+For video prediction:
+
+```python
+import torch
+from sam2.sam2_video_predictor import SAM2VideoPredictor
+
+predictor = SAM2VideoPredictor.from_pretrained("facebook/sam2-hiera-large")
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    state = predictor.init_state(<your_video>)
+
+    # add new prompts and instantly get the output on the same frame
+    frame_idx, object_ids, masks = predictor.add_new_points_or_box(state, <your_prompts>)
+
+ # propagate the prompts to get masklets throughout the video
+ for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
+ ...
+```
+
+## Model Description
+
+### SAM 2.1 checkpoints
+
+The table below shows the improved SAM 2.1 checkpoints released on September 29, 2024.
+
+| **Model** | **Size (M)** | **Speed (FPS)** | **SA-V test (J&F)** | **MOSE val (J&F)** | **LVOS v2 (J&F)** |
+| :------------------: | :----------: | :--------------------: | :-----------------: | :----------------: | :---------------: |
+| sam2.1_hiera_tiny ([config](sam2/configs/sam2.1/sam2.1_hiera_t.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_tiny.pt)) | 38.9 | 91.2 | 76.5 | 71.8 | 77.3 |
+| sam2.1_hiera_small ([config](sam2/configs/sam2.1/sam2.1_hiera_s.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_small.pt)) | 46 | 84.8 | 76.6 | 73.5 | 78.3 |
+| sam2.1_hiera_base_plus ([config](sam2/configs/sam2.1/sam2.1_hiera_b+.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_base_plus.pt)) | 80.8 | 64.1 | 78.2 | 73.7 | 78.2 |
+| sam2.1_hiera_large ([config](sam2/configs/sam2.1/sam2.1_hiera_l.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt)) | 224.4 | 39.5 | 79.5 | 74.6 | 80.6 |
+
+### SAM 2 checkpoints
+
+The previous SAM 2 checkpoints released on July 29, 2024 can be found as follows:
+
+| **Model** | **Size (M)** | **Speed (FPS)** | **SA-V test (J&F)** | **MOSE val (J&F)** | **LVOS v2 (J&F)** |
+| :------------------: | :----------: | :--------------------: | :-----------------: | :----------------: | :---------------: |
+| sam2_hiera_tiny ([config](sam2/configs/sam2/sam2_hiera_t.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt)) | 38.9 | 91.5 | 75.0 | 70.9 | 75.3 |
+| sam2_hiera_small ([config](sam2/configs/sam2/sam2_hiera_s.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt)) | 46 | 85.6 | 74.9 | 71.5 | 76.4 |
+| sam2_hiera_base_plus ([config](sam2/configs/sam2/sam2_hiera_b+.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_base_plus.pt)) | 80.8 | 64.8 | 74.7 | 72.8 | 75.8 |
+| sam2_hiera_large ([config](sam2/configs/sam2/sam2_hiera_l.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt)) | 224.4 | 39.7 | 76.0 | 74.6 | 79.8 |
+
+Speed measured on an A100 with `torch 2.5.1, cuda 12.4`. See `benchmark.py` for an example on benchmarking (compiling all the model components). Compiling only the image encoder can be more flexible and also provide (a smaller) speed-up (set `compile_image_encoder: True` in the config).
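+
+As a minimal sketch of the full-model compilation option mentioned in the updates above (assuming a PyTorch >= 2.5.1 environment and the checkpoint/config paths from the examples above), the compiled video predictor can be built as follows; usage is otherwise the same as in the video prediction example:
+
+```python
+from sam2.build_sam import build_sam2_video_predictor
+
+checkpoint = "./checkpoints/sam2.1_hiera_large.pt"
+model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
+
+# vos_optimized=True turns on torch.compile of the entire model for faster VOS
+# inference (see RELEASE_NOTES.md); predictions may differ slightly numerically.
+predictor = build_sam2_video_predictor(model_cfg, checkpoint, vos_optimized=True)
+```
+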
+## Segment Anything Video Dataset
+
+See [sav_dataset/README.md](sav_dataset/README.md) for details.
+
+## Training SAM 2
+
+You can train or fine-tune SAM 2 on custom datasets of images, videos, or both. Please check the training [README](training/README.md) on how to get started.
+
+## Web demo for SAM 2
+
+We have released the frontend + backend code for the SAM 2 web demo (a locally deployable version similar to https://sam2.metademolab.com/demo). Please see the web demo [README](demo/README.md) for details.
+
+## License
+
+The SAM 2 model checkpoints, SAM 2 demo code (front-end and back-end), and SAM 2 training code are licensed under [Apache 2.0](./LICENSE), however the [Inter Font](https://github.com/rsms/inter?tab=OFL-1.1-1-ov-file) and [Noto Color Emoji](https://github.com/googlefonts/noto-emoji) used in the SAM 2 demo code are made available under the [SIL Open Font License, version 1.1](https://openfontlicense.org/open-font-license-official-text/).
+
+## Contributing
+
+See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md).
+
+## Contributors
+
+The SAM 2 project was made possible with the help of many contributors (alphabetical):
+
+Karen Bergan, Daniel Bolya, Alex Bosenberg, Kai Brown, Vispi Cassod, Christopher Chedeau, Ida Cheng, Luc Dahlin, Shoubhik Debnath, Rene Martinez Doehner, Grant Gardner, Sahir Gomez, Rishi Godugu, Baishan Guo, Caleb Ho, Andrew Huang, Somya Jain, Bob Kamma, Amanda Kallet, Jake Kinney, Alexander Kirillov, Shiva Koduvayur, Devansh Kukreja, Robert Kuo, Aohan Lin, Parth Malani, Jitendra Malik, Mallika Malhotra, Miguel Martin, Alexander Miller, Sasha Mitts, William Ngan, George Orlin, Joelle Pineau, Kate Saenko, Rodrick Shepard, Azita Shokrpour, David Soofian, Jonathan Torres, Jenny Truong, Sagar Vaze, Meng Wang, Claudette Ward, Pengchuan Zhang.
+
+Third-party code: we use a GPU-based connected component algorithm adapted from [`cc_torch`](https://github.com/zsef123/Connected_components_PyTorch) (with its license in [`LICENSE_cctorch`](./LICENSE_cctorch)) as an optional post-processing step for the mask predictions.
+
+## Citing SAM 2
+
+If you use SAM 2 or the SA-V dataset in your research, please use the following BibTeX entry.
+
+```bibtex
+@article{ravi2024sam2,
+ title={SAM 2: Segment Anything in Images and Videos},
+ author={Ravi, Nikhila and Gabeur, Valentin and Hu, Yuan-Ting and Hu, Ronghang and Ryali, Chaitanya and Ma, Tengyu and Khedr, Haitham and R{\"a}dle, Roman and Rolland, Chloe and Gustafson, Laura and Mintun, Eric and Pan, Junting and Alwala, Kalyan Vasudev and Carion, Nicolas and Wu, Chao-Yuan and Girshick, Ross and Doll{\'a}r, Piotr and Feichtenhofer, Christoph},
+ journal={arXiv preprint arXiv:2408.00714},
+ url={https://arxiv.org/abs/2408.00714},
+ year={2024}
+}
+```
diff --git a/phantom/submodules/sam2/RELEASE_NOTES.md b/phantom/submodules/sam2/RELEASE_NOTES.md
new file mode 100644
index 0000000000000000000000000000000000000000..ee65ae7f4a51f1c7fce81204c5dc94467882d366
--- /dev/null
+++ b/phantom/submodules/sam2/RELEASE_NOTES.md
@@ -0,0 +1,27 @@
+## SAM 2 release notes
+
+### 12/11/2024 -- full model compilation for a major VOS speedup and a new `SAM2VideoPredictor` to better handle multi-object tracking
+
+- We now support `torch.compile` of the entire SAM 2 model on videos, which can be turned on by setting `vos_optimized=True` in `build_sam2_video_predictor` (it uses the new `SAM2VideoPredictorVOS` predictor class in `sam2/sam2_video_predictor.py`).
+ * Compared to the previous setting (which only compiles the image encoder backbone), the new full model compilation gives a major speedup in inference FPS.
+  * In the VOS prediction script `tools/vos_inference.py`, you can specify this option via the `--use_vos_optimized_video_predictor` flag.
+ * Note that turning on this flag might introduce a small variance in the predictions due to numerical differences caused by `torch.compile` of the full model.
+ * **PyTorch 2.5.1 is the minimum version for full support of this feature**. (Earlier PyTorch versions might run into compilation errors in some cases.) Therefore, we have updated the minimum PyTorch version to 2.5.1 accordingly in the installation scripts.
+- We also update the implementation of the `SAM2VideoPredictor` class for the SAM 2 video prediction in `sam2/sam2_video_predictor.py`, which allows for independent per-object inference. Specifically, in the new `SAM2VideoPredictor`:
+ * Now **we handle the inference of each object independently** (as if we are opening a separate session for each object) while sharing their backbone features.
+ * This change allows us to relax the assumption of prompting for multi-object tracking. Previously (due to the batching behavior in inference), if a video frame receives clicks for only a subset of objects, the rest of the (non-prompted) objects are assumed to be non-existent in this frame (i.e., in such frames, the user is telling SAM 2 that the rest of the objects don't appear). Now, if a frame receives clicks for only a subset of objects, we do not make any assumptions about the remaining (non-prompted) objects (i.e., now each object is handled independently and is not affected by how other objects are prompted). As a result, **we allow adding new objects after tracking starts** after this change (which was previously a restriction on usage).
+  * We believe that the new version is a more natural inference behavior and therefore switched to it as the default behavior. The previous implementation of `SAM2VideoPredictor` is backed up in `sam2/sam2_video_predictor_legacy.py`. All the VOS inference results using `tools/vos_inference.py` should remain the same after this change to the `SAM2VideoPredictor` class.
+
+### 09/30/2024 -- SAM 2.1 Developer Suite (new checkpoints, training code, web demo) is released
+
+- A new suite of improved model checkpoints (denoted as **SAM 2.1**) is released. See the [Model Description](README.md#model-description) section of the README for details.
+  * To use the new SAM 2.1 checkpoints, you need the latest model code from this repo. If you have installed an earlier version of this repo, please first uninstall the previous version via `pip uninstall SAM-2`, pull the latest code from this repo (with `git pull`), and then reinstall the repo following the [Installation](README.md#installation) section of the README.
+- The training (and fine-tuning) code has been released. See [`training/README.md`](training/README.md) on how to get started.
+- The frontend + backend code for the SAM 2 web demo has been released. See [`demo/README.md`](demo/README.md) for details.
+
+### 07/29/2024 -- SAM 2 is released
+
+- We release Segment Anything Model 2 (SAM 2), a foundation model towards solving promptable visual segmentation in images and videos.
+ * SAM 2 code: https://github.com/facebookresearch/sam2
+ * SAM 2 demo: https://sam2.metademolab.com/
+ * SAM 2 paper: https://arxiv.org/abs/2408.00714
diff --git a/phantom/submodules/sam2/backend.Dockerfile b/phantom/submodules/sam2/backend.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..54a32967b0e053ae2e7d16c734928636ef46db7b
--- /dev/null
+++ b/phantom/submodules/sam2/backend.Dockerfile
@@ -0,0 +1,64 @@
+ARG BASE_IMAGE=pytorch/pytorch:2.5.1-cuda12.1-cudnn9-runtime
+ARG MODEL_SIZE=base_plus
+
+FROM ${BASE_IMAGE}
+
+# Gunicorn environment variables
+ENV GUNICORN_WORKERS=1
+ENV GUNICORN_THREADS=2
+ENV GUNICORN_PORT=5000
+
+# SAM 2 environment variables
+ENV APP_ROOT=/opt/sam2
+ENV PYTHONUNBUFFERED=1
+ENV SAM2_BUILD_CUDA=0
+ENV MODEL_SIZE=${MODEL_SIZE}
+
+# Install system requirements
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ ffmpeg \
+ libavutil-dev \
+ libavcodec-dev \
+ libavformat-dev \
+ libswscale-dev \
+ pkg-config \
+ build-essential \
+ libffi-dev
+
+COPY setup.py .
+COPY README.md .
+
+RUN pip install --upgrade pip setuptools
+RUN pip install -e ".[interactive-demo]"
+
+# https://github.com/Kosinkadink/ComfyUI-VideoHelperSuite/issues/69#issuecomment-1826764707
+RUN rm /opt/conda/bin/ffmpeg && ln -s /bin/ffmpeg /opt/conda/bin/ffmpeg
+
+# Make app directory. This directory will host all files required for the
+# backend and SAM 2 inference files.
+RUN mkdir ${APP_ROOT}
+
+# Copy backend server files
+COPY demo/backend/server ${APP_ROOT}/server
+
+# Copy SAM 2 inference files
+COPY sam2 ${APP_ROOT}/server/sam2
+
+# Download SAM 2.1 checkpoints
+ADD https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_tiny.pt ${APP_ROOT}/checkpoints/sam2.1_hiera_tiny.pt
+ADD https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_small.pt ${APP_ROOT}/checkpoints/sam2.1_hiera_small.pt
+ADD https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_base_plus.pt ${APP_ROOT}/checkpoints/sam2.1_hiera_base_plus.pt
+ADD https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt ${APP_ROOT}/checkpoints/sam2.1_hiera_large.pt
+
+WORKDIR ${APP_ROOT}/server
+
+# https://pythonspeed.com/articles/gunicorn-in-docker/
+CMD gunicorn --worker-tmp-dir /dev/shm \
+ --worker-class gthread app:app \
+ --log-level info \
+ --access-logfile /dev/stdout \
+ --log-file /dev/stderr \
+ --workers ${GUNICORN_WORKERS} \
+ --threads ${GUNICORN_THREADS} \
+ --bind 0.0.0.0:${GUNICORN_PORT} \
+ --timeout 60
diff --git a/phantom/submodules/sam2/checkpoints/download_ckpts.sh b/phantom/submodules/sam2/checkpoints/download_ckpts.sh
new file mode 100755
index 0000000000000000000000000000000000000000..eedee8eee153f17c6db3b92de5492fa0a11ec3b7
--- /dev/null
+++ b/phantom/submodules/sam2/checkpoints/download_ckpts.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Use either wget or curl to download the checkpoints
+if command -v wget &> /dev/null; then
+ CMD="wget"
+elif command -v curl &> /dev/null; then
+ CMD="curl -L -O"
+else
+ echo "Please install wget or curl to download the checkpoints."
+ exit 1
+fi
+
+# Define the URLs for SAM 2 checkpoints
+# SAM2_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/072824"
+# sam2_hiera_t_url="${SAM2_BASE_URL}/sam2_hiera_tiny.pt"
+# sam2_hiera_s_url="${SAM2_BASE_URL}/sam2_hiera_small.pt"
+# sam2_hiera_b_plus_url="${SAM2_BASE_URL}/sam2_hiera_base_plus.pt"
+# sam2_hiera_l_url="${SAM2_BASE_URL}/sam2_hiera_large.pt"
+
+# Download each of the four checkpoints using wget
+# echo "Downloading sam2_hiera_tiny.pt checkpoint..."
+# $CMD $sam2_hiera_t_url || { echo "Failed to download checkpoint from $sam2_hiera_t_url"; exit 1; }
+
+# echo "Downloading sam2_hiera_small.pt checkpoint..."
+# $CMD $sam2_hiera_s_url || { echo "Failed to download checkpoint from $sam2_hiera_s_url"; exit 1; }
+
+# echo "Downloading sam2_hiera_base_plus.pt checkpoint..."
+# $CMD $sam2_hiera_b_plus_url || { echo "Failed to download checkpoint from $sam2_hiera_b_plus_url"; exit 1; }
+
+# echo "Downloading sam2_hiera_large.pt checkpoint..."
+# $CMD $sam2_hiera_l_url || { echo "Failed to download checkpoint from $sam2_hiera_l_url"; exit 1; }
+
+# Define the URLs for SAM 2.1 checkpoints
+SAM2p1_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824"
+sam2p1_hiera_t_url="${SAM2p1_BASE_URL}/sam2.1_hiera_tiny.pt"
+sam2p1_hiera_s_url="${SAM2p1_BASE_URL}/sam2.1_hiera_small.pt"
+sam2p1_hiera_b_plus_url="${SAM2p1_BASE_URL}/sam2.1_hiera_base_plus.pt"
+sam2p1_hiera_l_url="${SAM2p1_BASE_URL}/sam2.1_hiera_large.pt"
+
+# SAM 2.1 checkpoints
+echo "Downloading sam2.1_hiera_tiny.pt checkpoint..."
+$CMD $sam2p1_hiera_t_url || { echo "Failed to download checkpoint from $sam2p1_hiera_t_url"; exit 1; }
+
+echo "Downloading sam2.1_hiera_small.pt checkpoint..."
+$CMD $sam2p1_hiera_s_url || { echo "Failed to download checkpoint from $sam2p1_hiera_s_url"; exit 1; }
+
+echo "Downloading sam2.1_hiera_base_plus.pt checkpoint..."
+$CMD $sam2p1_hiera_b_plus_url || { echo "Failed to download checkpoint from $sam2p1_hiera_b_plus_url"; exit 1; }
+
+echo "Downloading sam2.1_hiera_large.pt checkpoint..."
+$CMD $sam2p1_hiera_l_url || { echo "Failed to download checkpoint from $sam2p1_hiera_l_url"; exit 1; }
+
+echo "All checkpoints are downloaded successfully."
diff --git a/phantom/submodules/sam2/docker-compose.yaml b/phantom/submodules/sam2/docker-compose.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7a5395a585daa7d5a6e0e97d3a30b48f225fb2cf
--- /dev/null
+++ b/phantom/submodules/sam2/docker-compose.yaml
@@ -0,0 +1,42 @@
+services:
+ frontend:
+ image: sam2/frontend
+ build:
+ context: ./demo/frontend
+ dockerfile: frontend.Dockerfile
+ ports:
+ - 7262:80
+
+ backend:
+ image: sam2/backend
+ build:
+ context: .
+ dockerfile: backend.Dockerfile
+ ports:
+ - 7263:5000
+ volumes:
+ - ./demo/data/:/data/:rw
+ environment:
+ - SERVER_ENVIRONMENT=DEV
+ - GUNICORN_WORKERS=1
+ # Inference API needs to have at least 2 threads to handle an incoming
+ # parallel cancel propagation request
+ - GUNICORN_THREADS=2
+ - GUNICORN_PORT=5000
+ - API_URL=http://localhost:7263
+ - DEFAULT_VIDEO_PATH=gallery/05_default_juggle.mp4
+ # # ffmpeg/video encode settings
+ - FFMPEG_NUM_THREADS=1
+ - VIDEO_ENCODE_CODEC=libx264
+ - VIDEO_ENCODE_CRF=23
+ - VIDEO_ENCODE_FPS=24
+ - VIDEO_ENCODE_MAX_WIDTH=1280
+ - VIDEO_ENCODE_MAX_HEIGHT=720
+ - VIDEO_ENCODE_VERBOSE=False
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: 1
+ capabilities: [gpu]
diff --git a/phantom/submodules/sam2/pyproject.toml b/phantom/submodules/sam2/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..f84317dbbfa6ba4f2d972cab2e2e0d0bdf07f003
--- /dev/null
+++ b/phantom/submodules/sam2/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+ "setuptools>=61.0",
+ "torch>=2.5.1",
+ ]
+build-backend = "setuptools.build_meta"
diff --git a/phantom/submodules/sam2/sam2/__init__.py b/phantom/submodules/sam2/sam2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0712dd03cb280ab94ba04f8a32aa8ddc8aa3db4a
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from hydra import initialize_config_module
+from hydra.core.global_hydra import GlobalHydra
+
+if not GlobalHydra.instance().is_initialized():
+ initialize_config_module("sam2", version_base="1.2")
diff --git a/phantom/submodules/sam2/sam2/automatic_mask_generator.py b/phantom/submodules/sam2/sam2/automatic_mask_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..065e469e27c2d3af40d51d072031e828692c799b
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/automatic_mask_generator.py
@@ -0,0 +1,454 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Adapted from https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from torchvision.ops.boxes import batched_nms, box_area # type: ignore
+
+from sam2.modeling.sam2_base import SAM2Base
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+from sam2.utils.amg import (
+ area_from_rle,
+ batch_iterator,
+ batched_mask_to_box,
+ box_xyxy_to_xywh,
+ build_all_layer_point_grids,
+ calculate_stability_score,
+ coco_encode_rle,
+ generate_crop_boxes,
+ is_box_near_crop_edge,
+ mask_to_rle_pytorch,
+ MaskData,
+ remove_small_regions,
+ rle_to_mask,
+ uncrop_boxes_xyxy,
+ uncrop_masks,
+ uncrop_points,
+)
+
+
+class SAM2AutomaticMaskGenerator:
+ def __init__(
+ self,
+ model: SAM2Base,
+ points_per_side: Optional[int] = 32,
+ points_per_batch: int = 64,
+ pred_iou_thresh: float = 0.8,
+ stability_score_thresh: float = 0.95,
+ stability_score_offset: float = 1.0,
+ mask_threshold: float = 0.0,
+ box_nms_thresh: float = 0.7,
+ crop_n_layers: int = 0,
+ crop_nms_thresh: float = 0.7,
+ crop_overlap_ratio: float = 512 / 1500,
+ crop_n_points_downscale_factor: int = 1,
+ point_grids: Optional[List[np.ndarray]] = None,
+ min_mask_region_area: int = 0,
+ output_mode: str = "binary_mask",
+ use_m2m: bool = False,
+ multimask_output: bool = True,
+ **kwargs,
+ ) -> None:
+ """
+ Using a SAM 2 model, generates masks for the entire image.
+ Generates a grid of point prompts over the image, then filters
+ low quality and duplicate masks. The default settings are chosen
+ for SAM 2 with a HieraL backbone.
+
+ Arguments:
+          model (SAM2Base): The SAM 2 model to use for mask prediction.
+ points_per_side (int or None): The number of points to be sampled
+ along one side of the image. The total number of points is
+ points_per_side**2. If None, 'point_grids' must provide explicit
+ point sampling.
+ points_per_batch (int): Sets the number of points run simultaneously
+ by the model. Higher numbers may be faster but use more GPU memory.
+ pred_iou_thresh (float): A filtering threshold in [0,1], using the
+ model's predicted mask quality.
+ stability_score_thresh (float): A filtering threshold in [0,1], using
+ the stability of the mask under changes to the cutoff used to binarize
+ the model's mask predictions.
+          stability_score_offset (float): The amount to shift the cutoff when
+            calculating the stability score.
+ mask_threshold (float): Threshold for binarizing the mask logits
+ box_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks.
+ crop_n_layers (int): If >0, mask prediction will be run again on
+ crops of the image. Sets the number of layers to run, where each
+ layer has 2**i_layer number of image crops.
+ crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks between different crops.
+ crop_overlap_ratio (float): Sets the degree to which crops overlap.
+ In the first crop layer, crops will overlap by this fraction of
+ the image length. Later layers with more crops scale down this overlap.
+ crop_n_points_downscale_factor (int): The number of points-per-side
+ sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ point_grids (list(np.ndarray) or None): A list over explicit grids
+ of points used for sampling, normalized to [0,1]. The nth grid in the
+ list is used in the nth crop layer. Exclusive with points_per_side.
+ min_mask_region_area (int): If >0, postprocessing will be applied
+ to remove disconnected regions and holes in masks with area smaller
+ than min_mask_region_area. Requires opencv.
+ output_mode (str): The form masks are returned in. Can be 'binary_mask',
+ 'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+ For large resolutions, 'binary_mask' may consume large amounts of
+ memory.
+ use_m2m (bool): Whether to add a one step refinement using previous mask predictions.
+ multimask_output (bool): Whether to output multimask at each point of the grid.
+ """
+
+ assert (points_per_side is None) != (
+ point_grids is None
+ ), "Exactly one of points_per_side or point_grid must be provided."
+ if points_per_side is not None:
+ self.point_grids = build_all_layer_point_grids(
+ points_per_side,
+ crop_n_layers,
+ crop_n_points_downscale_factor,
+ )
+ elif point_grids is not None:
+ self.point_grids = point_grids
+ else:
+ raise ValueError("Can't have both points_per_side and point_grid be None.")
+
+ assert output_mode in [
+ "binary_mask",
+ "uncompressed_rle",
+ "coco_rle",
+ ], f"Unknown output_mode {output_mode}."
+ if output_mode == "coco_rle":
+ try:
+ from pycocotools import mask as mask_utils # type: ignore # noqa: F401
+ except ImportError as e:
+ print("Please install pycocotools")
+ raise e
+
+ self.predictor = SAM2ImagePredictor(
+ model,
+ max_hole_area=min_mask_region_area,
+ max_sprinkle_area=min_mask_region_area,
+ )
+ self.points_per_batch = points_per_batch
+ self.pred_iou_thresh = pred_iou_thresh
+ self.stability_score_thresh = stability_score_thresh
+ self.stability_score_offset = stability_score_offset
+ self.mask_threshold = mask_threshold
+ self.box_nms_thresh = box_nms_thresh
+ self.crop_n_layers = crop_n_layers
+ self.crop_nms_thresh = crop_nms_thresh
+ self.crop_overlap_ratio = crop_overlap_ratio
+ self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+ self.min_mask_region_area = min_mask_region_area
+ self.output_mode = output_mode
+ self.use_m2m = use_m2m
+ self.multimask_output = multimask_output
+
+ @classmethod
+ def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2AutomaticMaskGenerator":
+ """
+ Load a pretrained model from the Hugging Face hub.
+
+ Arguments:
+ model_id (str): The Hugging Face repository ID.
+ **kwargs: Additional arguments to pass to the model constructor.
+
+ Returns:
+ (SAM2AutomaticMaskGenerator): The loaded model.
+ """
+ from sam2.build_sam import build_sam2_hf
+
+ sam_model = build_sam2_hf(model_id, **kwargs)
+ return cls(sam_model, **kwargs)
+
+ @torch.no_grad()
+ def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+ """
+ Generates masks for the given image.
+
+ Arguments:
+ image (np.ndarray): The image to generate masks for, in HWC uint8 format.
+
+ Returns:
+ list(dict(str, any)): A list over records for masks. Each record is
+ a dict containing the following keys:
+ segmentation (dict(str, any) or np.ndarray): The mask. If
+ output_mode='binary_mask', is an array of shape HW. Otherwise,
+ is a dictionary containing the RLE.
+ bbox (list(float)): The box around the mask, in XYWH format.
+ area (int): The area in pixels of the mask.
+ predicted_iou (float): The model's own prediction of the mask's
+ quality. This is filtered by the pred_iou_thresh parameter.
+ point_coords (list(list(float))): The point coordinates input
+ to the model to generate this mask.
+ stability_score (float): A measure of the mask's quality. This
+ is filtered on using the stability_score_thresh parameter.
+ crop_box (list(float)): The crop of the image used to generate
+ the mask, given in XYWH format.
+ """
+
+ # Generate masks
+ mask_data = self._generate_masks(image)
+
+ # Encode masks
+ if self.output_mode == "coco_rle":
+ mask_data["segmentations"] = [
+ coco_encode_rle(rle) for rle in mask_data["rles"]
+ ]
+ elif self.output_mode == "binary_mask":
+ mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+ else:
+ mask_data["segmentations"] = mask_data["rles"]
+
+ # Write mask records
+ curr_anns = []
+ for idx in range(len(mask_data["segmentations"])):
+ ann = {
+ "segmentation": mask_data["segmentations"][idx],
+ "area": area_from_rle(mask_data["rles"][idx]),
+ "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+ "predicted_iou": mask_data["iou_preds"][idx].item(),
+ "point_coords": [mask_data["points"][idx].tolist()],
+ "stability_score": mask_data["stability_score"][idx].item(),
+ "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+ }
+ curr_anns.append(ann)
+
+ return curr_anns
+
+ def _generate_masks(self, image: np.ndarray) -> MaskData:
+ orig_size = image.shape[:2]
+ crop_boxes, layer_idxs = generate_crop_boxes(
+ orig_size, self.crop_n_layers, self.crop_overlap_ratio
+ )
+
+ # Iterate over image crops
+ data = MaskData()
+ for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
+ crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
+ data.cat(crop_data)
+
+ # Remove duplicate masks between crops
+ if len(crop_boxes) > 1:
+ # Prefer masks from smaller crops
+ scores = 1 / box_area(data["crop_boxes"])
+ scores = scores.to(data["boxes"].device)
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ scores,
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.crop_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+ data.to_numpy()
+ return data
+
+ def _process_crop(
+ self,
+ image: np.ndarray,
+ crop_box: List[int],
+ crop_layer_idx: int,
+ orig_size: Tuple[int, ...],
+ ) -> MaskData:
+ # Crop the image and calculate embeddings
+ x0, y0, x1, y1 = crop_box
+ cropped_im = image[y0:y1, x0:x1, :]
+ cropped_im_size = cropped_im.shape[:2]
+ self.predictor.set_image(cropped_im)
+
+ # Get points for this crop
+ points_scale = np.array(cropped_im_size)[None, ::-1]
+ points_for_image = self.point_grids[crop_layer_idx] * points_scale
+
+ # Generate masks for this crop in batches
+ data = MaskData()
+ for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+ batch_data = self._process_batch(
+ points, cropped_im_size, crop_box, orig_size, normalize=True
+ )
+ data.cat(batch_data)
+ del batch_data
+ self.predictor.reset_predictor()
+
+ # Remove duplicates within this crop.
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ data["iou_preds"],
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.box_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+
+ # Return to the original image frame
+ data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+ data["points"] = uncrop_points(data["points"], crop_box)
+ data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
+
+ return data
+
+ def _process_batch(
+ self,
+ points: np.ndarray,
+ im_size: Tuple[int, ...],
+ crop_box: List[int],
+ orig_size: Tuple[int, ...],
+ normalize=False,
+ ) -> MaskData:
+ orig_h, orig_w = orig_size
+
+ # Run model on this batch
+ points = torch.as_tensor(
+ points, dtype=torch.float32, device=self.predictor.device
+ )
+ in_points = self.predictor._transforms.transform_coords(
+ points, normalize=normalize, orig_hw=im_size
+ )
+ in_labels = torch.ones(
+ in_points.shape[0], dtype=torch.int, device=in_points.device
+ )
+ masks, iou_preds, low_res_masks = self.predictor._predict(
+ in_points[:, None, :],
+ in_labels[:, None],
+ multimask_output=self.multimask_output,
+ return_logits=True,
+ )
+
+ # Serialize predictions and store in MaskData
+ data = MaskData(
+ masks=masks.flatten(0, 1),
+ iou_preds=iou_preds.flatten(0, 1),
+ points=points.repeat_interleave(masks.shape[1], dim=0),
+ low_res_masks=low_res_masks.flatten(0, 1),
+ )
+ del masks
+
+ if not self.use_m2m:
+ # Filter by predicted IoU
+ if self.pred_iou_thresh > 0.0:
+ keep_mask = data["iou_preds"] > self.pred_iou_thresh
+ data.filter(keep_mask)
+
+ # Calculate and filter by stability score
+ data["stability_score"] = calculate_stability_score(
+ data["masks"], self.mask_threshold, self.stability_score_offset
+ )
+ if self.stability_score_thresh > 0.0:
+ keep_mask = data["stability_score"] >= self.stability_score_thresh
+ data.filter(keep_mask)
+ else:
+ # One step refinement using previous mask predictions
+ in_points = self.predictor._transforms.transform_coords(
+ data["points"], normalize=normalize, orig_hw=im_size
+ )
+ labels = torch.ones(
+ in_points.shape[0], dtype=torch.int, device=in_points.device
+ )
+ masks, ious = self.refine_with_m2m(
+ in_points, labels, data["low_res_masks"], self.points_per_batch
+ )
+ data["masks"] = masks.squeeze(1)
+ data["iou_preds"] = ious.squeeze(1)
+
+ if self.pred_iou_thresh > 0.0:
+ keep_mask = data["iou_preds"] > self.pred_iou_thresh
+ data.filter(keep_mask)
+
+ data["stability_score"] = calculate_stability_score(
+ data["masks"], self.mask_threshold, self.stability_score_offset
+ )
+ if self.stability_score_thresh > 0.0:
+ keep_mask = data["stability_score"] >= self.stability_score_thresh
+ data.filter(keep_mask)
+
+ # Threshold masks and calculate boxes
+ data["masks"] = data["masks"] > self.mask_threshold
+ data["boxes"] = batched_mask_to_box(data["masks"])
+
+ # Filter boxes that touch crop boundaries
+ keep_mask = ~is_box_near_crop_edge(
+ data["boxes"], crop_box, [0, 0, orig_w, orig_h]
+ )
+ if not torch.all(keep_mask):
+ data.filter(keep_mask)
+
+ # Compress to RLE
+ data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+ data["rles"] = mask_to_rle_pytorch(data["masks"])
+ del data["masks"]
+
+ return data
+
+ @staticmethod
+ def postprocess_small_regions(
+ mask_data: MaskData, min_area: int, nms_thresh: float
+ ) -> MaskData:
+ """
+ Removes small disconnected regions and holes in masks, then reruns
+ box NMS to remove any new duplicates.
+
+ Edits mask_data in place.
+
+ Requires open-cv as a dependency.
+ """
+ if len(mask_data["rles"]) == 0:
+ return mask_data
+
+ # Filter small disconnected regions and holes
+ new_masks = []
+ scores = []
+ for rle in mask_data["rles"]:
+ mask = rle_to_mask(rle)
+
+ mask, changed = remove_small_regions(mask, min_area, mode="holes")
+ unchanged = not changed
+ mask, changed = remove_small_regions(mask, min_area, mode="islands")
+ unchanged = unchanged and not changed
+
+ new_masks.append(torch.as_tensor(mask).unsqueeze(0))
+ # Give score=0 to changed masks and score=1 to unchanged masks
+ # so NMS will prefer ones that didn't need postprocessing
+ scores.append(float(unchanged))
+
+ # Recalculate boxes and remove any new duplicates
+ masks = torch.cat(new_masks, dim=0)
+ boxes = batched_mask_to_box(masks)
+ keep_by_nms = batched_nms(
+ boxes.float(),
+ torch.as_tensor(scores),
+ torch.zeros_like(boxes[:, 0]), # categories
+ iou_threshold=nms_thresh,
+ )
+
+ # Only recalculate RLEs for masks that have changed
+ for i_mask in keep_by_nms:
+ if scores[i_mask] == 0.0:
+ mask_torch = masks[i_mask].unsqueeze(0)
+ mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
+ mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
+ mask_data.filter(keep_by_nms)
+
+ return mask_data
+
+ def refine_with_m2m(self, points, point_labels, low_res_masks, points_per_batch):
+ new_masks = []
+ new_iou_preds = []
+
+ for cur_points, cur_point_labels, low_res_mask in batch_iterator(
+ points_per_batch, points, point_labels, low_res_masks
+ ):
+ best_masks, best_iou_preds, _ = self.predictor._predict(
+ cur_points[:, None, :],
+ cur_point_labels[:, None],
+ mask_input=low_res_mask[:, None, :],
+ multimask_output=False,
+ return_logits=True,
+ )
+ new_masks.append(best_masks)
+ new_iou_preds.append(best_iou_preds)
+ masks = torch.cat(new_masks, dim=0)
+ return masks, torch.cat(new_iou_preds, dim=0)
diff --git a/phantom/submodules/sam2/sam2/benchmark.py b/phantom/submodules/sam2/sam2/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6519534c8619e04b9a632859a5128ad2cee34c13
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/benchmark.py
@@ -0,0 +1,92 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import time
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from sam2.build_sam import build_sam2_video_predictor
+
+# Only cuda supported
+assert torch.cuda.is_available()
+device = torch.device("cuda")
+
+torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+if torch.cuda.get_device_properties(0).major >= 8:
+ # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+
+# Config and checkpoint
+sam2_checkpoint = "checkpoints/sam2.1_hiera_base_plus.pt"
+model_cfg = "configs/sam2.1/sam2.1_hiera_b+.yaml"
+
+# Build video predictor with vos_optimized=True setting
+predictor = build_sam2_video_predictor(
+ model_cfg, sam2_checkpoint, device=device, vos_optimized=True
+)
+
+
+# Initialize with video
+video_dir = "notebooks/videos/bedroom"
+# scan all the JPEG frame names in this directory
+frame_names = [
+ p
+ for p in os.listdir(video_dir)
+ if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
+]
+frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+inference_state = predictor.init_state(video_path=video_dir)
+
+
+# Number of runs, warmup etc
+warm_up, runs = 5, 25
+verbose = True
+num_frames = len(frame_names)
+total, count = 0, 0
+torch.cuda.empty_cache()
+
+# We will select an object with a click.
+# See video_predictor_example.ipynb for more detailed explanation
+ann_frame_idx, ann_obj_id = 0, 1
+# Add a positive click at (x, y) = (210, 350)
+# For labels, `1` means positive click
+points = np.array([[210, 350]], dtype=np.float32)
+labels = np.array([1], np.int32)
+
+_, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
+ inference_state=inference_state,
+ frame_idx=ann_frame_idx,
+ obj_id=ann_obj_id,
+ points=points,
+ labels=labels,
+)
+
+# Warmup and then average FPS over several runs
+with torch.autocast("cuda", torch.bfloat16):
+ with torch.inference_mode():
+ for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"):
+ start = time.time()
+ # Start tracking
+ for (
+ out_frame_idx,
+ out_obj_ids,
+ out_mask_logits,
+ ) in predictor.propagate_in_video(inference_state):
+ pass
+
+ end = time.time()
+ total += end - start
+ count += 1
+ if i == warm_up - 1:
+ print("Warmup FPS: ", count * num_frames / total)
+ total = 0
+ count = 0
+
+print("FPS: ", count * num_frames / total)
diff --git a/phantom/submodules/sam2/sam2/build_sam.py b/phantom/submodules/sam2/sam2/build_sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a3bef1e566d86c3ba0fd75f425530bc6505e9bf
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/build_sam.py
@@ -0,0 +1,174 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import torch
+from hydra import compose
+from hydra.utils import instantiate
+from omegaconf import OmegaConf
+
+import sam2
+
+# Check if the user is running Python from the parent directory of the sam2 repo
+# (i.e. the directory where this repo is cloned into) -- this is not supported since
+# it could shadow the sam2 package and cause issues.
+if os.path.isdir(os.path.join(sam2.__path__[0], "sam2")):
+ # If the user has "sam2/sam2" in their path, they are likely importing the repo itself
+ # as "sam2" rather than importing the "sam2" python package (i.e. "sam2/sam2" directory).
+ # This typically happens because the user is running Python from the parent directory
+ # that contains the sam2 repo they cloned.
+ raise RuntimeError(
+ "You're likely running Python from the parent directory of the sam2 repository "
+ "(i.e. the directory where https://github.com/facebookresearch/sam2 is cloned into). "
+ "This is not supported since the `sam2` Python package could be shadowed by the "
+ "repository name (the repository is also named `sam2` and contains the Python package "
+ "in `sam2/sam2`). Please run Python from another directory (e.g. from the repo dir "
+ "rather than its parent dir, or from your home directory) after installing SAM 2."
+ )
+
+
+HF_MODEL_ID_TO_FILENAMES = {
+ "facebook/sam2-hiera-tiny": (
+ "configs/sam2/sam2_hiera_t.yaml",
+ "sam2_hiera_tiny.pt",
+ ),
+ "facebook/sam2-hiera-small": (
+ "configs/sam2/sam2_hiera_s.yaml",
+ "sam2_hiera_small.pt",
+ ),
+ "facebook/sam2-hiera-base-plus": (
+ "configs/sam2/sam2_hiera_b+.yaml",
+ "sam2_hiera_base_plus.pt",
+ ),
+ "facebook/sam2-hiera-large": (
+ "configs/sam2/sam2_hiera_l.yaml",
+ "sam2_hiera_large.pt",
+ ),
+ "facebook/sam2.1-hiera-tiny": (
+ "configs/sam2.1/sam2.1_hiera_t.yaml",
+ "sam2.1_hiera_tiny.pt",
+ ),
+ "facebook/sam2.1-hiera-small": (
+ "configs/sam2.1/sam2.1_hiera_s.yaml",
+ "sam2.1_hiera_small.pt",
+ ),
+ "facebook/sam2.1-hiera-base-plus": (
+ "configs/sam2.1/sam2.1_hiera_b+.yaml",
+ "sam2.1_hiera_base_plus.pt",
+ ),
+ "facebook/sam2.1-hiera-large": (
+ "configs/sam2.1/sam2.1_hiera_l.yaml",
+ "sam2.1_hiera_large.pt",
+ ),
+}
+
+
+def build_sam2(
+ config_file,
+ ckpt_path=None,
+ device="cuda",
+ mode="eval",
+ hydra_overrides_extra=[],
+ apply_postprocessing=True,
+ **kwargs,
+):
+
+ if apply_postprocessing:
+ hydra_overrides_extra = hydra_overrides_extra.copy()
+ hydra_overrides_extra += [
+ # dynamically fall back to multi-mask if the single mask is not stable
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
+ ]
+ # Read config and init model
+ cfg = compose(config_name=config_file, overrides=hydra_overrides_extra)
+ OmegaConf.resolve(cfg)
+ model = instantiate(cfg.model, _recursive_=True)
+ _load_checkpoint(model, ckpt_path)
+ model = model.to(device)
+ if mode == "eval":
+ model.eval()
+ return model
+
+
+def build_sam2_video_predictor(
+ config_file,
+ ckpt_path=None,
+ device="cuda",
+ mode="eval",
+ hydra_overrides_extra=[],
+ apply_postprocessing=True,
+ vos_optimized=False,
+ **kwargs,
+):
+ hydra_overrides = [
+ "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor",
+ ]
+ if vos_optimized:
+ hydra_overrides = [
+ "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictorVOS",
+ "++model.compile_image_encoder=True", # Let sam2_base handle this
+ ]
+
+ if apply_postprocessing:
+ hydra_overrides_extra = hydra_overrides_extra.copy()
+ hydra_overrides_extra += [
+ # dynamically fall back to multi-mask if the single mask is not stable
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
+ # binarize the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
+ "++model.binarize_mask_from_pts_for_mem_enc=true",
+ # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
+ "++model.fill_hole_area=8",
+ ]
+ hydra_overrides.extend(hydra_overrides_extra)
+
+ # Read config and init model
+ cfg = compose(config_name=config_file, overrides=hydra_overrides)
+ OmegaConf.resolve(cfg)
+ model = instantiate(cfg.model, _recursive_=True)
+ _load_checkpoint(model, ckpt_path)
+ model = model.to(device)
+ if mode == "eval":
+ model.eval()
+ return model
+
+
+def _hf_download(model_id):
+ from huggingface_hub import hf_hub_download
+
+ config_name, checkpoint_name = HF_MODEL_ID_TO_FILENAMES[model_id]
+ ckpt_path = hf_hub_download(repo_id=model_id, filename=checkpoint_name)
+ return config_name, ckpt_path
+
+
+def build_sam2_hf(model_id, **kwargs):
+ config_name, ckpt_path = _hf_download(model_id)
+ return build_sam2(config_file=config_name, ckpt_path=ckpt_path, **kwargs)
+
+
+def build_sam2_video_predictor_hf(model_id, **kwargs):
+ config_name, ckpt_path = _hf_download(model_id)
+ return build_sam2_video_predictor(
+ config_file=config_name, ckpt_path=ckpt_path, **kwargs
+ )
+
+
+def _load_checkpoint(model, ckpt_path):
+ if ckpt_path is not None:
+ sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
+ missing_keys, unexpected_keys = model.load_state_dict(sd)
+ if missing_keys:
+ logging.error(missing_keys)
+ raise RuntimeError()
+ if unexpected_keys:
+ logging.error(unexpected_keys)
+ raise RuntimeError()
+ logging.info("Loaded checkpoint successfully")
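
Illustrative sketch (not part of the diff above): a minimal example of how the builders in build_sam.py are typically used when checkpoints are resolved through the Hugging Face hub. SAM2ImagePredictor is imported from sam2.sam2_image_predictor, which is added elsewhere in this diff; the video directory reuses the path from benchmark.py above.

import torch
from sam2.build_sam import build_sam2_hf, build_sam2_video_predictor_hf
from sam2.sam2_image_predictor import SAM2ImagePredictor

# Image pipeline: build the base model, then wrap it in an image predictor.
sam2_model = build_sam2_hf("facebook/sam2.1-hiera-small", device="cuda")
image_predictor = SAM2ImagePredictor(sam2_model)

# Video pipeline: the HF builder maps the model ID to its config/checkpoint pair
# via HF_MODEL_ID_TO_FILENAMES and returns a video predictor.
video_predictor = build_sam2_video_predictor_hf("facebook/sam2.1-hiera-small", device="cuda")
with torch.inference_mode():
    state = video_predictor.init_state(video_path="notebooks/videos/bedroom")
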
diff --git a/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml b/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d7172f9b0b663aaaace97fed7e2a08db75150461
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml
@@ -0,0 +1,116 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 112
+ num_heads: 2
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [896, 448, 224, 112]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_l.yaml b/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_l.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..23073ea7a95901be656b3c6d1a66ce8736ab7ad3
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_l.yaml
@@ -0,0 +1,120 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 144
+ num_heads: 2
+ stages: [2, 6, 36, 4]
+ global_att_blocks: [23, 33, 43]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ window_spec: [8, 4, 16, 8]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [1152, 576, 288, 144]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_s.yaml b/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_s.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fd8d40465b18b3de39b0a565aca712306306c4ed
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_s.yaml
@@ -0,0 +1,119 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 11, 2]
+ global_att_blocks: [7, 10, 13]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_t.yaml b/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_t.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e762aec932f26436d13798f3feb3ec82c360a943
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2.1/sam2.1_hiera_t.yaml
@@ -0,0 +1,121 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 7, 2]
+ global_att_blocks: [5, 7, 9]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ # SAM decoder
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ # HieraT does not currently support compilation, should always be set to False
+ compile_image_encoder: False
diff --git a/phantom/submodules/sam2/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml b/phantom/submodules/sam2/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9b6faa79f47ee576faf007bffd23fb6649bd881d
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml
@@ -0,0 +1,339 @@
+# @package _global_
+
+scratch:
+ resolution: 1024
+ train_batch_size: 1
+ num_train_workers: 10
+ num_frames: 8
+ max_num_objects: 3
+ base_lr: 5.0e-6
+ vision_lr: 3.0e-06
+ phases_per_epoch: 1
+ num_epochs: 40
+
+dataset:
+ # PATHS to Dataset
+ img_folder: null # PATH to MOSE JPEGImages folder
+ gt_folder: null # PATH to MOSE Annotations folder
+ file_list_txt: training/assets/MOSE_sample_train_list.txt # Optional PATH to filelist containing a subset of videos to be used for training
+ multiplier: 2
+
+# Video transforms
+vos:
+ train_transforms:
+ - _target_: training.dataset.transforms.ComposeAPI
+ transforms:
+ - _target_: training.dataset.transforms.RandomHorizontalFlip
+ consistent_transform: True
+ - _target_: training.dataset.transforms.RandomAffine
+ degrees: 25
+ shear: 20
+ image_interpolation: bilinear
+ consistent_transform: True
+ - _target_: training.dataset.transforms.RandomResizeAPI
+ sizes: ${scratch.resolution}
+ square: true
+ consistent_transform: True
+ - _target_: training.dataset.transforms.ColorJitter
+ consistent_transform: True
+ brightness: 0.1
+ contrast: 0.03
+ saturation: 0.03
+ hue: null
+ - _target_: training.dataset.transforms.RandomGrayscale
+ p: 0.05
+ consistent_transform: True
+ - _target_: training.dataset.transforms.ColorJitter
+ consistent_transform: False
+ brightness: 0.1
+ contrast: 0.05
+ saturation: 0.05
+ hue: null
+ - _target_: training.dataset.transforms.ToTensorAPI
+ - _target_: training.dataset.transforms.NormalizeAPI
+ mean: [0.485, 0.456, 0.406]
+ std: [0.229, 0.224, 0.225]
+
+trainer:
+ _target_: training.trainer.Trainer
+ mode: train_only
+ max_epochs: ${times:${scratch.num_epochs},${scratch.phases_per_epoch}}
+ accelerator: cuda
+ seed_value: 123
+
+ model:
+ _target_: training.model.sam2.SAM2Train
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 112
+ num_heads: 2
+ drop_path_rate: 0.1
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [896, 448, 224, 112]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: ${scratch.resolution}
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ # compile_image_encoder: False
+
+ ####### Training specific params #######
+ # box/point input and corrections
+ prob_to_use_pt_input_for_train: 0.5
+ prob_to_use_pt_input_for_eval: 0.0
+ prob_to_use_box_input_for_train: 0.5 # 0.5*0.5 = 0.25 prob to use box instead of points
+ prob_to_use_box_input_for_eval: 0.0
+ prob_to_sample_from_gt_for_train: 0.1 # with a small prob, sampling correction points from GT mask instead of prediction errors
+ num_frames_to_correct_for_train: 2 # iteratively sample on random 1~2 frames (always include the first frame)
+ num_frames_to_correct_for_eval: 1 # only iteratively sample on first frame
+ rand_frames_to_correct_for_train: True # random #init-cond-frame ~ 2
+ add_all_frames_to_correct_as_cond: True # when a frame receives a correction click, it becomes a conditioning frame (even if it's not initially a conditioning frame)
+ # maximum 2 initial conditioning frames
+ num_init_cond_frames_for_train: 2
+ rand_init_cond_frames_for_train: True # random 1~2
+ num_correction_pt_per_frame: 7
+ use_act_ckpt_iterative_pt_sampling: false
+
+
+
+ num_init_cond_frames_for_eval: 1 # only mask on the first frame
+ forward_backbone_per_frame_for_eval: True
+
+
+ data:
+ train:
+ _target_: training.dataset.sam2_datasets.TorchTrainMixedDataset
+ phases_per_epoch: ${scratch.phases_per_epoch}
+ batch_sizes:
+ - ${scratch.train_batch_size}
+
+ datasets:
+ - _target_: training.dataset.utils.RepeatFactorWrapper
+ dataset:
+ _target_: training.dataset.utils.ConcatDataset
+ datasets:
+ - _target_: training.dataset.vos_dataset.VOSDataset
+ transforms: ${vos.train_transforms}
+ training: true
+ video_dataset:
+ _target_: training.dataset.vos_raw_dataset.PNGRawDataset
+ img_folder: ${dataset.img_folder}
+ gt_folder: ${dataset.gt_folder}
+ file_list_txt: ${dataset.file_list_txt}
+ sampler:
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
+ num_frames: ${scratch.num_frames}
+ max_num_objects: ${scratch.max_num_objects}
+ multiplier: ${dataset.multiplier}
+ shuffle: True
+ num_workers: ${scratch.num_train_workers}
+ pin_memory: True
+ drop_last: True
+ collate_fn:
+ _target_: training.utils.data_utils.collate_fn
+ _partial_: true
+ dict_key: all
+
+ optim:
+ amp:
+ enabled: True
+ amp_dtype: bfloat16
+
+ optimizer:
+ _target_: torch.optim.AdamW
+
+ gradient_clip:
+ _target_: training.optimizer.GradientClipper
+ max_norm: 0.1
+ norm_type: 2
+
+ param_group_modifiers:
+ - _target_: training.optimizer.layer_decay_param_modifier
+ _partial_: True
+ layer_decay_value: 0.9
+ apply_to: 'image_encoder.trunk'
+ overrides:
+ - pattern: '*pos_embed*'
+ value: 1.0
+
+ options:
+ lr:
+ - scheduler:
+ _target_: fvcore.common.param_scheduler.CosineParamScheduler
+ start_value: ${scratch.base_lr}
+ end_value: ${divide:${scratch.base_lr},10}
+ - scheduler:
+ _target_: fvcore.common.param_scheduler.CosineParamScheduler
+ start_value: ${scratch.vision_lr}
+ end_value: ${divide:${scratch.vision_lr},10}
+ param_names:
+ - 'image_encoder.*'
+ weight_decay:
+ - scheduler:
+ _target_: fvcore.common.param_scheduler.ConstantParamScheduler
+ value: 0.1
+ - scheduler:
+ _target_: fvcore.common.param_scheduler.ConstantParamScheduler
+ value: 0.0
+ param_names:
+ - '*bias*'
+ module_cls_names: ['torch.nn.LayerNorm']
+
+ loss:
+ all:
+ _target_: training.loss_fns.MultiStepMultiMasksAndIous
+ weight_dict:
+ loss_mask: 20
+ loss_dice: 1
+ loss_iou: 1
+ loss_class: 1
+ supervise_all_iou: true
+ iou_use_l1_loss: true
+ pred_obj_scores: true
+ focal_gamma_obj_score: 0.0
+ focal_alpha_obj_score: -1.0
+
+ distributed:
+ backend: nccl
+ find_unused_parameters: True
+
+ logging:
+ tensorboard_writer:
+ _target_: training.utils.logger.make_tensorboard_logger
+ log_dir: ${launcher.experiment_log_dir}/tensorboard
+ flush_secs: 120
+ should_log: True
+ log_dir: ${launcher.experiment_log_dir}/logs
+ log_freq: 10
+
+ # initialize from a SAM 2 checkpoint
+ checkpoint:
+ save_dir: ${launcher.experiment_log_dir}/checkpoints
+ save_freq: 0 # 0 only last checkpoint is saved.
+ model_weight_initializer:
+ _partial_: True
+ _target_: training.utils.checkpoint_utils.load_state_dict_into_model
+ strict: True
+ ignore_unexpected_keys: null
+ ignore_missing_keys: null
+
+ state_dict:
+ _target_: training.utils.checkpoint_utils.load_checkpoint_and_apply_kernels
+ checkpoint_path: ./checkpoints/sam2.1_hiera_base_plus.pt # PATH to SAM 2.1 checkpoint
+ ckpt_state_dict_keys: ['model']
+
+launcher:
+ num_nodes: 1
+ gpus_per_node: 8
+ experiment_log_dir: null # Path to log directory, defaults to ./sam2_logs/${config_name}
+
+# SLURM args if running on a cluster
+submitit:
+ partition: null
+ account: null
+ qos: null
+ cpus_per_task: 10
+ use_cluster: false
+ timeout_hour: 24
+ name: null
+ port_range: [10000, 65000]
+
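Illustrative sketch (not part of the diff above): the finetuning config relies on custom OmegaConf resolvers such as ${times:...} (for trainer.max_epochs) and ${divide:...} (for the learning-rate end values), which the SAM 2 training entry point registers. The snippet below only shows how those interpolations resolve, under the assumption that times and divide behave as plain multiplication and division.

from omegaconf import OmegaConf

OmegaConf.register_new_resolver("times", lambda a, b: a * b, replace=True)
OmegaConf.register_new_resolver("divide", lambda a, b: a / b, replace=True)

cfg = OmegaConf.create({
    "scratch": {"num_epochs": 40, "phases_per_epoch": 1, "base_lr": 5.0e-6},
    "trainer": {"max_epochs": "${times:${scratch.num_epochs},${scratch.phases_per_epoch}}"},
    "lr_end": "${divide:${scratch.base_lr},10}",
})
print(cfg.trainer.max_epochs)  # 40
print(cfg.lr_end)              # 5e-07
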
diff --git a/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_b+.yaml b/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_b+.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0f435af02fc88e2d3b7bff06f8cf8013cc079c24
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_b+.yaml
@@ -0,0 +1,113 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 112
+ num_heads: 2
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [896, 448, 224, 112]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_l.yaml b/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_l.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1092802b1d24be6fedf78939f45b0d021d4ec560
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_l.yaml
@@ -0,0 +1,117 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 144
+ num_heads: 2
+ stages: [2, 6, 36, 4]
+ global_att_blocks: [23, 33, 43]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ window_spec: [8, 4, 16, 8]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [1152, 576, 288, 144]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_s.yaml b/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_s.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..174e414f1467d80e94a34e9525dc373058f8caaa
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_s.yaml
@@ -0,0 +1,116 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 11, 2]
+ global_att_blocks: [7, 10, 13]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_t.yaml b/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_t.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..121447aabd5318fac20efc2bc00d7c406ca26f01
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/configs/sam2/sam2_hiera_t.yaml
@@ -0,0 +1,118 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 7, 2]
+ global_att_blocks: [5, 7, 9]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ # SAM decoder
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ # HieraT does not currently support compilation, should always be set to False
+ compile_image_encoder: False
diff --git a/phantom/submodules/sam2/sam2/csrc/connected_components.cu b/phantom/submodules/sam2/sam2/csrc/connected_components.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ced21eb32eaaadb818d441c1322b99d1bf068f45
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/csrc/connected_components.cu
@@ -0,0 +1,289 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+// adapted from https://github.com/zsef123/Connected_components_PyTorch
+// with license found in the LICENSE_cctorch file in the root directory.
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <torch/extension.h>
+#include <torch/script.h>
+#include <vector>
+
+// 2d
+#define BLOCK_ROWS 16
+#define BLOCK_COLS 16
+
+namespace cc2d {
+
+template <typename T>
+__device__ __forceinline__ unsigned char hasBit(T bitmap, unsigned char pos) {
+ return (bitmap >> pos) & 1;
+}
+
+__device__ int32_t find(const int32_t* s_buf, int32_t n) {
+ while (s_buf[n] != n)
+ n = s_buf[n];
+ return n;
+}
+
+__device__ int32_t find_n_compress(int32_t* s_buf, int32_t n) {
+ const int32_t id = n;
+ while (s_buf[n] != n) {
+ n = s_buf[n];
+ s_buf[id] = n;
+ }
+ return n;
+}
+
+__device__ void union_(int32_t* s_buf, int32_t a, int32_t b) {
+ bool done;
+ do {
+ a = find(s_buf, a);
+ b = find(s_buf, b);
+
+ if (a < b) {
+ int32_t old = atomicMin(s_buf + b, a);
+ done = (old == b);
+ b = old;
+ } else if (b < a) {
+ int32_t old = atomicMin(s_buf + a, b);
+ done = (old == a);
+ a = old;
+ } else
+ done = true;
+
+ } while (!done);
+}
+
+__global__ void
+init_labeling(int32_t* label, const uint32_t W, const uint32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t idx = row * W + col;
+
+ if (row < H && col < W)
+ label[idx] = idx;
+}
+
+__global__ void
+merge(uint8_t* img, int32_t* label, const uint32_t W, const uint32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t idx = row * W + col;
+
+ if (row >= H || col >= W)
+ return;
+
+ uint32_t P = 0;
+
+ if (img[idx])
+ P |= 0x777;
+ if (row + 1 < H && img[idx + W])
+ P |= 0x777 << 4;
+ if (col + 1 < W && img[idx + 1])
+ P |= 0x777 << 1;
+
+ if (col == 0)
+ P &= 0xEEEE;
+ if (col + 1 >= W)
+ P &= 0x3333;
+ else if (col + 2 >= W)
+ P &= 0x7777;
+
+ if (row == 0)
+ P &= 0xFFF0;
+ if (row + 1 >= H)
+ P &= 0xFF;
+
+ if (P > 0) {
+    // If we need to check the top-left pixel (first bit of P set) and that
+    // pixel is foreground
+ if (hasBit(P, 0) && img[idx - W - 1]) {
+ union_(label, idx, idx - 2 * W - 2); // top left block
+ }
+
+ if ((hasBit(P, 1) && img[idx - W]) || (hasBit(P, 2) && img[idx - W + 1]))
+ union_(label, idx, idx - 2 * W); // top bottom block
+
+ if (hasBit(P, 3) && img[idx + 2 - W])
+ union_(label, idx, idx - 2 * W + 2); // top right block
+
+ if ((hasBit(P, 4) && img[idx - 1]) || (hasBit(P, 8) && img[idx + W - 1]))
+ union_(label, idx, idx - 2); // just left block
+ }
+}
+
+__global__ void compression(int32_t* label, const int32_t W, const int32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t idx = row * W + col;
+
+ if (row < H && col < W)
+ find_n_compress(label, idx);
+}
+
+__global__ void final_labeling(
+ const uint8_t* img,
+ int32_t* label,
+ const int32_t W,
+ const int32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t idx = row * W + col;
+
+ if (row >= H || col >= W)
+ return;
+
+ int32_t y = label[idx] + 1;
+
+ if (img[idx])
+ label[idx] = y;
+ else
+ label[idx] = 0;
+
+ if (col + 1 < W) {
+ if (img[idx + 1])
+ label[idx + 1] = y;
+ else
+ label[idx + 1] = 0;
+
+ if (row + 1 < H) {
+ if (img[idx + W + 1])
+ label[idx + W + 1] = y;
+ else
+ label[idx + W + 1] = 0;
+ }
+ }
+
+ if (row + 1 < H) {
+ if (img[idx + W])
+ label[idx + W] = y;
+ else
+ label[idx + W] = 0;
+ }
+}
+
+__global__ void init_counting(
+ const int32_t* label,
+ int32_t* count_init,
+ const int32_t W,
+ const int32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+ const uint32_t idx = row * W + col;
+
+ if (row >= H || col >= W)
+ return;
+
+ int32_t y = label[idx];
+ if (y > 0) {
+ int32_t count_idx = y - 1;
+ atomicAdd(count_init + count_idx, 1);
+ }
+}
+
+__global__ void final_counting(
+ const int32_t* label,
+ const int32_t* count_init,
+ int32_t* count_final,
+ const int32_t W,
+ const int32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+ const uint32_t idx = row * W + col;
+
+ if (row >= H || col >= W)
+ return;
+
+ int32_t y = label[idx];
+ if (y > 0) {
+ int32_t count_idx = y - 1;
+ count_final[idx] = count_init[count_idx];
+ } else {
+ count_final[idx] = 0;
+ }
+}
+
+} // namespace cc2d
+
+std::vector<torch::Tensor> get_connected_componnets(
+ const torch::Tensor& inputs) {
+ AT_ASSERTM(inputs.is_cuda(), "inputs must be a CUDA tensor");
+ AT_ASSERTM(inputs.ndimension() == 4, "inputs must be [N, 1, H, W] shape");
+ AT_ASSERTM(
+ inputs.scalar_type() == torch::kUInt8, "inputs must be a uint8 type");
+
+ const uint32_t N = inputs.size(0);
+ const uint32_t C = inputs.size(1);
+ const uint32_t H = inputs.size(2);
+ const uint32_t W = inputs.size(3);
+
+ AT_ASSERTM(C == 1, "inputs must be [N, 1, H, W] shape");
+ AT_ASSERTM((H % 2) == 0, "height must be an even number");
+ AT_ASSERTM((W % 2) == 0, "width must be an even number");
+
+ // label must be uint32_t
+ auto label_options =
+ torch::TensorOptions().dtype(torch::kInt32).device(inputs.device());
+ torch::Tensor labels = torch::zeros({N, C, H, W}, label_options);
+ torch::Tensor counts_init = torch::zeros({N, C, H, W}, label_options);
+ torch::Tensor counts_final = torch::zeros({N, C, H, W}, label_options);
+
+ dim3 grid = dim3(
+ ((W + 1) / 2 + BLOCK_COLS - 1) / BLOCK_COLS,
+ ((H + 1) / 2 + BLOCK_ROWS - 1) / BLOCK_ROWS);
+ dim3 block = dim3(BLOCK_COLS, BLOCK_ROWS);
+ dim3 grid_count =
+ dim3((W + BLOCK_COLS) / BLOCK_COLS, (H + BLOCK_ROWS) / BLOCK_ROWS);
+ dim3 block_count = dim3(BLOCK_COLS, BLOCK_ROWS);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ for (int n = 0; n < N; n++) {
+ uint32_t offset = n * H * W;
+
+    cc2d::init_labeling<<<grid, block, 0, stream>>>(
+        labels.data_ptr<int32_t>() + offset, W, H);
+    cc2d::merge<<<grid, block, 0, stream>>>(
+        inputs.data_ptr<uint8_t>() + offset,
+        labels.data_ptr<int32_t>() + offset,
+        W,
+        H);
+    cc2d::compression<<<grid, block, 0, stream>>>(
+        labels.data_ptr<int32_t>() + offset, W, H);
+    cc2d::final_labeling<<<grid, block, 0, stream>>>(
+        inputs.data_ptr<uint8_t>() + offset,
+        labels.data_ptr<int32_t>() + offset,
+        W,
+        H);
+
+    // get the counting of each pixel
+    cc2d::init_counting<<<grid_count, block_count, 0, stream>>>(
+        labels.data_ptr<int32_t>() + offset,
+        counts_init.data_ptr<int32_t>() + offset,
+        W,
+        H);
+    cc2d::final_counting<<<grid_count, block_count, 0, stream>>>(
+        labels.data_ptr<int32_t>() + offset,
+        counts_init.data_ptr<int32_t>() + offset,
+        counts_final.data_ptr<int32_t>() + offset,
+        W,
+        H);
+ }
+
+ // returned values are [labels, counts]
+ std::vector outputs;
+ outputs.push_back(labels);
+ outputs.push_back(counts_final);
+ return outputs;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def(
+ "get_connected_componnets",
+ &get_connected_componnets,
+ "get_connected_componnets");
+}
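+
+// Usage sketch (illustrative, not part of the upstream source): when this file is
+// compiled as a PyTorch CUDA extension (SAM2 ships it as the optional `sam2._C`
+// module), the op labels connected components of a binary uint8 mask, e.g.:
+//
+//   import torch
+//   from sam2 import _C          # assumes the optional CUDA extension was built
+//   mask = (torch.rand(1, 1, 256, 256, device="cuda") > 0.5).to(torch.uint8)
+//   labels, counts = _C.get_connected_componnets(mask)
+//   # labels, counts: int32 tensors of shape (1, 1, 256, 256); H and W must be even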
diff --git a/phantom/submodules/sam2/sam2/modeling/__init__.py b/phantom/submodules/sam2/sam2/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/sam2/modeling/backbones/__init__.py b/phantom/submodules/sam2/sam2/modeling/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/backbones/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/sam2/modeling/backbones/hieradet.py b/phantom/submodules/sam2/sam2/modeling/backbones/hieradet.py
new file mode 100644
index 0000000000000000000000000000000000000000..19ac77b61d8e1345a301686d39ef2ab6e4b035fb
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/backbones/hieradet.py
@@ -0,0 +1,317 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from functools import partial
+from typing import List, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from iopath.common.file_io import g_pathmgr
+
+from sam2.modeling.backbones.utils import (
+ PatchEmbed,
+ window_partition,
+ window_unpartition,
+)
+
+from sam2.modeling.sam2_utils import DropPath, MLP
+
+
+def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor:
+ if pool is None:
+ return x
+ # (B, H, W, C) -> (B, C, H, W)
+ x = x.permute(0, 3, 1, 2)
+ x = pool(x)
+ # (B, C, H', W') -> (B, H', W', C)
+ x = x.permute(0, 2, 3, 1)
+ if norm:
+ x = norm(x)
+
+ return x
+
+
+class MultiScaleAttention(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ dim_out: int,
+ num_heads: int,
+ q_pool: nn.Module = None,
+ ):
+ super().__init__()
+
+ self.dim = dim
+ self.dim_out = dim_out
+ self.num_heads = num_heads
+ self.q_pool = q_pool
+ self.qkv = nn.Linear(dim, dim_out * 3)
+ self.proj = nn.Linear(dim_out, dim_out)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ B, H, W, _ = x.shape
+ # qkv with shape (B, H * W, 3, nHead, C)
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1)
+ # q, k, v with shape (B, H * W, nheads, C)
+ q, k, v = torch.unbind(qkv, 2)
+
+ # Q pooling (for downsample at stage changes)
+ if self.q_pool:
+ q = do_pool(q.reshape(B, H, W, -1), self.q_pool)
+ H, W = q.shape[1:3] # downsampled shape
+ q = q.reshape(B, H * W, self.num_heads, -1)
+
+ # Torch's SDPA expects [B, nheads, H*W, C] so we transpose
+ x = F.scaled_dot_product_attention(
+ q.transpose(1, 2),
+ k.transpose(1, 2),
+ v.transpose(1, 2),
+ )
+ # Transpose back
+ x = x.transpose(1, 2)
+ x = x.reshape(B, H, W, -1)
+
+ x = self.proj(x)
+
+ return x
+
+
+class MultiScaleBlock(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ dim_out: int,
+ num_heads: int,
+ mlp_ratio: float = 4.0,
+ drop_path: float = 0.0,
+ norm_layer: Union[nn.Module, str] = "LayerNorm",
+ q_stride: Tuple[int, int] = None,
+ act_layer: nn.Module = nn.GELU,
+ window_size: int = 0,
+ ):
+ super().__init__()
+
+ if isinstance(norm_layer, str):
+ norm_layer = partial(getattr(nn, norm_layer), eps=1e-6)
+
+ self.dim = dim
+ self.dim_out = dim_out
+ self.norm1 = norm_layer(dim)
+
+ self.window_size = window_size
+
+ self.pool, self.q_stride = None, q_stride
+ if self.q_stride:
+ self.pool = nn.MaxPool2d(
+ kernel_size=q_stride, stride=q_stride, ceil_mode=False
+ )
+
+ self.attn = MultiScaleAttention(
+ dim,
+ dim_out,
+ num_heads=num_heads,
+ q_pool=self.pool,
+ )
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.norm2 = norm_layer(dim_out)
+ self.mlp = MLP(
+ dim_out,
+ int(dim_out * mlp_ratio),
+ dim_out,
+ num_layers=2,
+ activation=act_layer,
+ )
+
+ if dim != dim_out:
+ self.proj = nn.Linear(dim, dim_out)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ shortcut = x # B, H, W, C
+ x = self.norm1(x)
+
+ # Skip connection
+ if self.dim != self.dim_out:
+ shortcut = do_pool(self.proj(x), self.pool)
+
+ # Window partition
+ window_size = self.window_size
+ if window_size > 0:
+ H, W = x.shape[1], x.shape[2]
+ x, pad_hw = window_partition(x, window_size)
+
+ # Window Attention + Q Pooling (if stage change)
+ x = self.attn(x)
+ if self.q_stride:
+ # Shapes have changed due to Q pooling
+ window_size = self.window_size // self.q_stride[0]
+ H, W = shortcut.shape[1:3]
+
+ pad_h = (window_size - H % window_size) % window_size
+ pad_w = (window_size - W % window_size) % window_size
+ pad_hw = (H + pad_h, W + pad_w)
+
+ # Reverse window partition
+ if self.window_size > 0:
+ x = window_unpartition(x, window_size, pad_hw, (H, W))
+
+ x = shortcut + self.drop_path(x)
+ # MLP
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+
+class Hiera(nn.Module):
+ """
+ Reference: https://arxiv.org/abs/2306.00989
+ """
+
+ def __init__(
+ self,
+ embed_dim: int = 96, # initial embed dim
+ num_heads: int = 1, # initial number of heads
+ drop_path_rate: float = 0.0, # stochastic depth
+ q_pool: int = 3, # number of q_pool stages
+ q_stride: Tuple[int, int] = (2, 2), # downsample stride bet. stages
+ stages: Tuple[int, ...] = (2, 3, 16, 3), # blocks per stage
+ dim_mul: float = 2.0, # dim_mul factor at stage shift
+ head_mul: float = 2.0, # head_mul factor at stage shift
+ window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
+ # window size per stage, when not using global att.
+ window_spec: Tuple[int, ...] = (
+ 8,
+ 4,
+ 14,
+ 7,
+ ),
+ # global attn in these blocks
+ global_att_blocks: Tuple[int, ...] = (
+ 12,
+ 16,
+ 20,
+ ),
+ weights_path=None,
+ return_interm_layers=True, # return feats from every stage
+ ):
+ super().__init__()
+
+ assert len(stages) == len(window_spec)
+ self.window_spec = window_spec
+
+ depth = sum(stages)
+ self.q_stride = q_stride
+ self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
+ assert 0 <= q_pool <= len(self.stage_ends[:-1])
+ self.q_pool_blocks = [x + 1 for x in self.stage_ends[:-1]][:q_pool]
+ self.return_interm_layers = return_interm_layers
+
+ self.patch_embed = PatchEmbed(
+ embed_dim=embed_dim,
+ )
+ # Which blocks have global att?
+ self.global_att_blocks = global_att_blocks
+
+ # Windowed positional embedding (https://arxiv.org/abs/2311.05613)
+ self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size
+ self.pos_embed = nn.Parameter(
+ torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size)
+ )
+ self.pos_embed_window = nn.Parameter(
+ torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0])
+ )
+
+ dpr = [
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
+ ] # stochastic depth decay rule
+
+ cur_stage = 1
+ self.blocks = nn.ModuleList()
+
+ for i in range(depth):
+ dim_out = embed_dim
+ # lags by a block, so first block of
+ # next stage uses an initial window size
+ # of previous stage and final window size of current stage
+ window_size = self.window_spec[cur_stage - 1]
+
+ if self.global_att_blocks is not None:
+ window_size = 0 if i in self.global_att_blocks else window_size
+
+ if i - 1 in self.stage_ends:
+ dim_out = int(embed_dim * dim_mul)
+ num_heads = int(num_heads * head_mul)
+ cur_stage += 1
+
+ block = MultiScaleBlock(
+ dim=embed_dim,
+ dim_out=dim_out,
+ num_heads=num_heads,
+ drop_path=dpr[i],
+ q_stride=self.q_stride if i in self.q_pool_blocks else None,
+ window_size=window_size,
+ )
+
+ embed_dim = dim_out
+ self.blocks.append(block)
+
+ self.channel_list = (
+ [self.blocks[i].dim_out for i in self.stage_ends[::-1]]
+ if return_interm_layers
+ else [self.blocks[-1].dim_out]
+ )
+
+ if weights_path is not None:
+ with g_pathmgr.open(weights_path, "rb") as f:
+ chkpt = torch.load(f, map_location="cpu")
+ logging.info("loading Hiera", self.load_state_dict(chkpt, strict=False))
+
+ def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
+ h, w = hw
+ window_embed = self.pos_embed_window
+ pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic")
+ pos_embed = pos_embed + window_embed.tile(
+ [x // y for x, y in zip(pos_embed.shape, window_embed.shape)]
+ )
+ pos_embed = pos_embed.permute(0, 2, 3, 1)
+ return pos_embed
+
+ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+ x = self.patch_embed(x)
+ # x: (B, H, W, C)
+
+ # Add pos embed
+ x = x + self._get_pos_embed(x.shape[1:3])
+
+ outputs = []
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if (i == self.stage_ends[-1]) or (
+ i in self.stage_ends and self.return_interm_layers
+ ):
+ feats = x.permute(0, 3, 1, 2)
+ outputs.append(feats)
+
+ return outputs
+
+ def get_layer_id(self, layer_name):
+ # https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
+ num_layers = self.get_num_layers()
+
+ if layer_name.find("rel_pos") != -1:
+ return num_layers + 1
+ elif layer_name.find("pos_embed") != -1:
+ return 0
+ elif layer_name.find("patch_embed") != -1:
+ return 0
+ elif layer_name.find("blocks") != -1:
+ return int(layer_name.split("blocks")[1].split(".")[1]) + 1
+ else:
+ return num_layers + 1
+
+ def get_num_layers(self) -> int:
+ return len(self.blocks)
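+
+
+if __name__ == "__main__":
+    # Minimal sketch, not part of the upstream file: run the default Hiera trunk on a
+    # dummy batch and print the per-stage feature shapes. The 256x256 input size is an
+    # assumption to keep the example light; SAM2 itself works on 1024x1024 images.
+    dummy = torch.randn(1, 3, 256, 256)
+    trunk = Hiera()  # defaults: embed_dim=96, stages=(2, 3, 16, 3), dim_mul=2.0
+    with torch.no_grad():
+        feats = trunk(dummy)
+    for f in feats:
+        # Expected: (1, 96, 64, 64), (1, 192, 32, 32), (1, 384, 16, 16), (1, 768, 8, 8)
+        print(tuple(f.shape))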
diff --git a/phantom/submodules/sam2/sam2/modeling/backbones/image_encoder.py b/phantom/submodules/sam2/sam2/modeling/backbones/image_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..37e9266bc98596e97ca303118c910ed24f6cee2c
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/backbones/image_encoder.py
@@ -0,0 +1,134 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ImageEncoder(nn.Module):
+ def __init__(
+ self,
+ trunk: nn.Module,
+ neck: nn.Module,
+ scalp: int = 0,
+ ):
+ super().__init__()
+ self.trunk = trunk
+ self.neck = neck
+ self.scalp = scalp
+ assert (
+ self.trunk.channel_list == self.neck.backbone_channel_list
+ ), f"Channel dims of trunk and neck do not match. Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}"
+
+ def forward(self, sample: torch.Tensor):
+ # Forward through backbone
+ features, pos = self.neck(self.trunk(sample))
+ if self.scalp > 0:
+ # Discard the lowest resolution features
+ features, pos = features[: -self.scalp], pos[: -self.scalp]
+
+ src = features[-1]
+ output = {
+ "vision_features": src,
+ "vision_pos_enc": pos,
+ "backbone_fpn": features,
+ }
+ return output
+
+
+class FpnNeck(nn.Module):
+ """
+ A modified variant of Feature Pyramid Network (FPN) neck
+ (we remove output conv and also do bicubic interpolation similar to ViT
+ pos embed interpolation)
+ """
+
+ def __init__(
+ self,
+ position_encoding: nn.Module,
+ d_model: int,
+ backbone_channel_list: List[int],
+ kernel_size: int = 1,
+ stride: int = 1,
+ padding: int = 0,
+ fpn_interp_model: str = "bilinear",
+ fuse_type: str = "sum",
+ fpn_top_down_levels: Optional[List[int]] = None,
+ ):
+ """Initialize the neck
+ :param trunk: the backbone
+ :param position_encoding: the positional encoding to use
+ :param d_model: the dimension of the model
+ :param neck_norm: the normalization to use
+ """
+ super().__init__()
+ self.position_encoding = position_encoding
+ self.convs = nn.ModuleList()
+ self.backbone_channel_list = backbone_channel_list
+ self.d_model = d_model
+ for dim in backbone_channel_list:
+ current = nn.Sequential()
+ current.add_module(
+ "conv",
+ nn.Conv2d(
+ in_channels=dim,
+ out_channels=d_model,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ ),
+ )
+
+ self.convs.append(current)
+ self.fpn_interp_model = fpn_interp_model
+ assert fuse_type in ["sum", "avg"]
+ self.fuse_type = fuse_type
+
+ # levels to have top-down features in its outputs
+ # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3
+ # have top-down propagation, while outputs of level 0 and level 1 have only
+ # lateral features from the same backbone level.
+ if fpn_top_down_levels is None:
+ # default is to have top-down features on all levels
+ fpn_top_down_levels = range(len(self.convs))
+ self.fpn_top_down_levels = list(fpn_top_down_levels)
+
+ def forward(self, xs: List[torch.Tensor]):
+
+ out = [None] * len(self.convs)
+ pos = [None] * len(self.convs)
+ assert len(xs) == len(self.convs)
+ # fpn forward pass
+ # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py
+ prev_features = None
+ # forward in top-down order (from low to high resolution)
+ n = len(self.convs) - 1
+ for i in range(n, -1, -1):
+ x = xs[i]
+ lateral_features = self.convs[n - i](x)
+ if i in self.fpn_top_down_levels and prev_features is not None:
+ top_down_features = F.interpolate(
+ prev_features.to(dtype=torch.float32),
+ scale_factor=2.0,
+ mode=self.fpn_interp_model,
+ align_corners=(
+ None if self.fpn_interp_model == "nearest" else False
+ ),
+ antialias=False,
+ )
+ prev_features = lateral_features + top_down_features
+ if self.fuse_type == "avg":
+ prev_features /= 2
+ else:
+ prev_features = lateral_features
+ x_out = prev_features
+ out[i] = x_out
+ pos[i] = self.position_encoding(x_out).to(x_out.dtype)
+
+ return out, pos
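+
+
+if __name__ == "__main__":
+    # Minimal composition sketch, not part of the upstream file: wire a Hiera trunk,
+    # an FpnNeck and a sine position encoding into ImageEncoder. The neck settings
+    # (d_model=256, scalp=1, top-down levels [2, 3]) follow the vendored Hydra configs;
+    # the trunk uses Hiera's constructor defaults, whose channel_list is [768, 384, 192, 96].
+    from sam2.modeling.backbones.hieradet import Hiera
+    from sam2.modeling.position_encoding import PositionEmbeddingSine
+
+    trunk = Hiera()
+    neck = FpnNeck(
+        position_encoding=PositionEmbeddingSine(num_pos_feats=256, warmup_cache=False),
+        d_model=256,
+        backbone_channel_list=[768, 384, 192, 96],
+        fpn_top_down_levels=[2, 3],
+        fpn_interp_model="nearest",
+    )
+    encoder = ImageEncoder(trunk=trunk, neck=neck, scalp=1)
+    with torch.no_grad():
+        out = encoder(torch.randn(1, 3, 256, 256))
+    # scalp=1 drops the lowest-resolution level; the coarsest kept map is stride 16.
+    print(out["vision_features"].shape)  # -> torch.Size([1, 256, 16, 16])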
diff --git a/phantom/submodules/sam2/sam2/modeling/backbones/utils.py b/phantom/submodules/sam2/sam2/modeling/backbones/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..930b1b7622e7b0e7270120dcafccc242ef0f4f28
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/backbones/utils.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Some utilities for backbones, in particular for windowing"""
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def window_partition(x, window_size):
+ """
+ Partition into non-overlapping windows with padding if needed.
+ Args:
+ x (tensor): input tokens with [B, H, W, C].
+ window_size (int): window size.
+ Returns:
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
+ (Hp, Wp): padded height and width before partition
+ """
+ B, H, W, C = x.shape
+
+ pad_h = (window_size - H % window_size) % window_size
+ pad_w = (window_size - W % window_size) % window_size
+ if pad_h > 0 or pad_w > 0:
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+ Hp, Wp = H + pad_h, W + pad_w
+
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+ windows = x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)
+ return windows, (Hp, Wp)
+
+
+def window_unpartition(windows, window_size, pad_hw, hw):
+ """
+ Window unpartition into original sequences and removing padding.
+ Args:
+        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+ window_size (int): window size.
+ pad_hw (Tuple): padded height and width (Hp, Wp).
+ hw (Tuple): original height and width (H, W) before padding.
+ Returns:
+ x: unpartitioned sequences with [B, H, W, C].
+ """
+ Hp, Wp = pad_hw
+ H, W = hw
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+ x = windows.reshape(
+ B, Hp // window_size, Wp // window_size, window_size, window_size, -1
+ )
+ x = x.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1)
+
+ if Hp > H or Wp > W:
+ x = x[:, :H, :W, :]
+ return x
+
+
+class PatchEmbed(nn.Module):
+ """
+ Image to Patch Embedding.
+ """
+
+ def __init__(
+ self,
+ kernel_size: Tuple[int, ...] = (7, 7),
+ stride: Tuple[int, ...] = (4, 4),
+ padding: Tuple[int, ...] = (3, 3),
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ ):
+ """
+ Args:
+ kernel_size (Tuple): kernel size of the projection layer.
+ stride (Tuple): stride of the projection layer.
+ padding (Tuple): padding size of the projection layer.
+ in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+ """
+ super().__init__()
+ self.proj = nn.Conv2d(
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.proj(x)
+ # B C H W -> B H W C
+ x = x.permute(0, 2, 3, 1)
+ return x
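+
+
+if __name__ == "__main__":
+    # Minimal sketch, not part of the upstream file: patch-embed a dummy image, then
+    # round-trip it through window_partition / window_unpartition to show that padding
+    # for a non-divisible window size is added and stripped transparently.
+    embed = PatchEmbed(embed_dim=96)
+    x = embed(torch.randn(2, 3, 72, 72))          # -> (2, 18, 18, 96), stride-4 tokens
+    windows, pad_hw = window_partition(x, 7)      # -> (2 * 3 * 3, 7, 7, 96), pad_hw=(21, 21)
+    y = window_unpartition(windows, 7, pad_hw, (18, 18))
+    assert torch.equal(x, y) and y.shape == (2, 18, 18, 96)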
diff --git a/phantom/submodules/sam2/sam2/modeling/memory_attention.py b/phantom/submodules/sam2/sam2/modeling/memory_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b07f9d87e3d8194ca5e11fc20f01604d591a59d
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/memory_attention.py
@@ -0,0 +1,169 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional
+
+import torch
+from torch import nn, Tensor
+
+from sam2.modeling.sam.transformer import RoPEAttention
+
+from sam2.modeling.sam2_utils import get_activation_fn, get_clones
+
+
+class MemoryAttentionLayer(nn.Module):
+
+ def __init__(
+ self,
+ activation: str,
+ cross_attention: nn.Module,
+ d_model: int,
+ dim_feedforward: int,
+ dropout: float,
+ pos_enc_at_attn: bool,
+ pos_enc_at_cross_attn_keys: bool,
+ pos_enc_at_cross_attn_queries: bool,
+ self_attention: nn.Module,
+ ):
+ super().__init__()
+ self.d_model = d_model
+ self.dim_feedforward = dim_feedforward
+ self.dropout_value = dropout
+ self.self_attn = self_attention
+ self.cross_attn_image = cross_attention
+
+ # Implementation of Feedforward model
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
+ self.dropout = nn.Dropout(dropout)
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+ self.norm1 = nn.LayerNorm(d_model)
+ self.norm2 = nn.LayerNorm(d_model)
+ self.norm3 = nn.LayerNorm(d_model)
+ self.dropout1 = nn.Dropout(dropout)
+ self.dropout2 = nn.Dropout(dropout)
+ self.dropout3 = nn.Dropout(dropout)
+
+ self.activation_str = activation
+ self.activation = get_activation_fn(activation)
+
+ # Where to add pos enc
+ self.pos_enc_at_attn = pos_enc_at_attn
+ self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
+ self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys
+
+ def _forward_sa(self, tgt, query_pos):
+ # Self-Attention
+ tgt2 = self.norm1(tgt)
+ q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
+ tgt2 = self.self_attn(q, k, v=tgt2)
+ tgt = tgt + self.dropout1(tgt2)
+ return tgt
+
+ def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
+ kwds = {}
+ if num_k_exclude_rope > 0:
+ assert isinstance(self.cross_attn_image, RoPEAttention)
+ kwds = {"num_k_exclude_rope": num_k_exclude_rope}
+
+ # Cross-Attention
+ tgt2 = self.norm2(tgt)
+ tgt2 = self.cross_attn_image(
+ q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
+ k=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
+ v=memory,
+ **kwds,
+ )
+ tgt = tgt + self.dropout2(tgt2)
+ return tgt
+
+ def forward(
+ self,
+ tgt,
+ memory,
+ pos: Optional[Tensor] = None,
+ query_pos: Optional[Tensor] = None,
+ num_k_exclude_rope: int = 0,
+ ) -> torch.Tensor:
+
+ # Self-Attn, Cross-Attn
+ tgt = self._forward_sa(tgt, query_pos)
+ tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
+ # MLP
+ tgt2 = self.norm3(tgt)
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
+ tgt = tgt + self.dropout3(tgt2)
+ return tgt
+
+
+class MemoryAttention(nn.Module):
+ def __init__(
+ self,
+ d_model: int,
+ pos_enc_at_input: bool,
+ layer: nn.Module,
+ num_layers: int,
+ batch_first: bool = True, # Do layers expect batch first input?
+ ):
+ super().__init__()
+ self.d_model = d_model
+ self.layers = get_clones(layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = nn.LayerNorm(d_model)
+ self.pos_enc_at_input = pos_enc_at_input
+ self.batch_first = batch_first
+
+ def forward(
+ self,
+ curr: torch.Tensor, # self-attention inputs
+ memory: torch.Tensor, # cross-attention inputs
+ curr_pos: Optional[Tensor] = None, # pos_enc for self-attention inputs
+ memory_pos: Optional[Tensor] = None, # pos_enc for cross-attention inputs
+ num_obj_ptr_tokens: int = 0, # number of object pointer *tokens*
+ ):
+ if isinstance(curr, list):
+ assert isinstance(curr_pos, list)
+ assert len(curr) == len(curr_pos) == 1
+ curr, curr_pos = (
+ curr[0],
+ curr_pos[0],
+ )
+
+ assert (
+ curr.shape[1] == memory.shape[1]
+ ), "Batch size must be the same for curr and memory"
+
+ output = curr
+ if self.pos_enc_at_input and curr_pos is not None:
+ output = output + 0.1 * curr_pos
+
+ if self.batch_first:
+ # Convert to batch first
+ output = output.transpose(0, 1)
+ curr_pos = curr_pos.transpose(0, 1)
+ memory = memory.transpose(0, 1)
+ memory_pos = memory_pos.transpose(0, 1)
+
+ for layer in self.layers:
+ kwds = {}
+ if isinstance(layer.cross_attn_image, RoPEAttention):
+ kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}
+
+ output = layer(
+ tgt=output,
+ memory=memory,
+ pos=memory_pos,
+ query_pos=curr_pos,
+ **kwds,
+ )
+ normed_output = self.norm(output)
+
+ if self.batch_first:
+ # Convert back to seq first
+ normed_output = normed_output.transpose(0, 1)
+ curr_pos = curr_pos.transpose(0, 1)
+
+ return normed_output
diff --git a/phantom/submodules/sam2/sam2/modeling/memory_encoder.py b/phantom/submodules/sam2/sam2/modeling/memory_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..f60202dfaba87232c3870fb2101b5322a119d985
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/memory_encoder.py
@@ -0,0 +1,181 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from sam2.modeling.sam2_utils import DropPath, get_clones, LayerNorm2d
+
+
+class MaskDownSampler(nn.Module):
+ """
+ Progressively downsample a mask by total_stride, each time by stride.
+ Note that LayerNorm is applied per *token*, like in ViT.
+
+ With each downsample (by a factor stride**2), channel capacity increases by the same factor.
+ In the end, we linearly project to embed_dim channels.
+ """
+
+ def __init__(
+ self,
+ embed_dim=256,
+ kernel_size=4,
+ stride=4,
+ padding=0,
+ total_stride=16,
+ activation=nn.GELU,
+ ):
+ super().__init__()
+ num_layers = int(math.log2(total_stride) // math.log2(stride))
+ assert stride**num_layers == total_stride
+ self.encoder = nn.Sequential()
+ mask_in_chans, mask_out_chans = 1, 1
+ for _ in range(num_layers):
+ mask_out_chans = mask_in_chans * (stride**2)
+ self.encoder.append(
+ nn.Conv2d(
+ mask_in_chans,
+ mask_out_chans,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ )
+ )
+ self.encoder.append(LayerNorm2d(mask_out_chans))
+ self.encoder.append(activation())
+ mask_in_chans = mask_out_chans
+
+ self.encoder.append(nn.Conv2d(mask_out_chans, embed_dim, kernel_size=1))
+
+ def forward(self, x):
+ return self.encoder(x)
+
+
+# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt)
+class CXBlock(nn.Module):
+ r"""ConvNeXt Block. There are two equivalent implementations:
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+ We use (2) as we find it slightly faster in PyTorch
+
+ Args:
+ dim (int): Number of input channels.
+ drop_path (float): Stochastic depth rate. Default: 0.0
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ """
+
+ def __init__(
+ self,
+ dim,
+ kernel_size=7,
+ padding=3,
+ drop_path=0.0,
+ layer_scale_init_value=1e-6,
+ use_dwconv=True,
+ ):
+ super().__init__()
+ self.dwconv = nn.Conv2d(
+ dim,
+ dim,
+ kernel_size=kernel_size,
+ padding=padding,
+ groups=dim if use_dwconv else 1,
+ ) # depthwise conv
+ self.norm = LayerNorm2d(dim, eps=1e-6)
+ self.pwconv1 = nn.Linear(
+ dim, 4 * dim
+ ) # pointwise/1x1 convs, implemented with linear layers
+ self.act = nn.GELU()
+ self.pwconv2 = nn.Linear(4 * dim, dim)
+ self.gamma = (
+ nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
+ if layer_scale_init_value > 0
+ else None
+ )
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ def forward(self, x):
+ input = x
+ x = self.dwconv(x)
+ x = self.norm(x)
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
+ x = self.pwconv1(x)
+ x = self.act(x)
+ x = self.pwconv2(x)
+ if self.gamma is not None:
+ x = self.gamma * x
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
+
+ x = input + self.drop_path(x)
+ return x
+
+
+class Fuser(nn.Module):
+ def __init__(self, layer, num_layers, dim=None, input_projection=False):
+ super().__init__()
+ self.proj = nn.Identity()
+ self.layers = get_clones(layer, num_layers)
+
+ if input_projection:
+ assert dim is not None
+ self.proj = nn.Conv2d(dim, dim, kernel_size=1)
+
+ def forward(self, x):
+ # normally x: (N, C, H, W)
+ x = self.proj(x)
+ for layer in self.layers:
+ x = layer(x)
+ return x
+
+
+class MemoryEncoder(nn.Module):
+ def __init__(
+ self,
+ out_dim,
+ mask_downsampler,
+ fuser,
+ position_encoding,
+ in_dim=256, # in_dim of pix_feats
+ ):
+ super().__init__()
+
+ self.mask_downsampler = mask_downsampler
+
+ self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
+ self.fuser = fuser
+ self.position_encoding = position_encoding
+ self.out_proj = nn.Identity()
+ if out_dim != in_dim:
+ self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)
+
+ def forward(
+ self,
+ pix_feat: torch.Tensor,
+ masks: torch.Tensor,
+ skip_mask_sigmoid: bool = False,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ## Process masks
+ # sigmoid, so that less domain shift from gt masks which are bool
+ if not skip_mask_sigmoid:
+ masks = F.sigmoid(masks)
+ masks = self.mask_downsampler(masks)
+
+ ## Fuse pix_feats and downsampled masks
+ # in case the visual features are on CPU, cast them to CUDA
+ pix_feat = pix_feat.to(masks.device)
+
+ x = self.pix_feat_proj(pix_feat)
+ x = x + masks
+ x = self.fuser(x)
+ x = self.out_proj(x)
+
+ pos = self.position_encoding(x).to(x.dtype)
+
+ return {"vision_features": x, "vision_pos_enc": [pos]}
diff --git a/phantom/submodules/sam2/sam2/modeling/position_encoding.py b/phantom/submodules/sam2/sam2/modeling/position_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..2241d4cf1a4495b4c67dc35cbed1c606357b9b7a
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/position_encoding.py
@@ -0,0 +1,239 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Any, Optional, Tuple
+
+import numpy as np
+
+import torch
+from torch import nn
+
+
+class PositionEmbeddingSine(nn.Module):
+ """
+ This is a more standard version of the position embedding, very similar to the one
+ used by the Attention Is All You Need paper, generalized to work on images.
+ """
+
+ def __init__(
+ self,
+ num_pos_feats,
+ temperature: int = 10000,
+ normalize: bool = True,
+ scale: Optional[float] = None,
+        # The following settings are only relevant
+        # for warming up the cache for compilation
+ warmup_cache: bool = True,
+ image_size: int = 1024,
+ strides: Tuple[int] = (4, 8, 16, 32),
+ ):
+ super().__init__()
+ assert num_pos_feats % 2 == 0, "Expecting even model width"
+ self.num_pos_feats = num_pos_feats // 2
+ self.temperature = temperature
+ self.normalize = normalize
+ if scale is not None and normalize is False:
+ raise ValueError("normalize should be True if scale is passed")
+ if scale is None:
+ scale = 2 * math.pi
+ self.scale = scale
+
+ self.cache = {}
+ if warmup_cache and torch.cuda.is_available():
+ # Warmup cache for cuda, to help with compilation
+ device = torch.device("cuda")
+ for stride in strides:
+ cache_key = (image_size // stride, image_size // stride)
+ self._pe(1, device, *cache_key)
+
+ def _encode_xy(self, x, y):
+ # The positions are expected to be normalized
+ assert len(x) == len(y) and x.ndim == y.ndim == 1
+ x_embed = x * self.scale
+ y_embed = y * self.scale
+
+ dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+ dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+ pos_x = x_embed[:, None] / dim_t
+ pos_y = y_embed[:, None] / dim_t
+ pos_x = torch.stack(
+ (pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2
+ ).flatten(1)
+ pos_y = torch.stack(
+ (pos_y[:, 0::2].sin(), pos_y[:, 1::2].cos()), dim=2
+ ).flatten(1)
+ return pos_x, pos_y
+
+ @torch.no_grad()
+ def encode_boxes(self, x, y, w, h):
+ pos_x, pos_y = self._encode_xy(x, y)
+ pos = torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)
+ return pos
+
+ encode = encode_boxes # Backwards compatibility
+
+ @torch.no_grad()
+ def encode_points(self, x, y, labels):
+ (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
+ assert bx == by and nx == ny and bx == bl and nx == nl
+ pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten())
+ pos_x, pos_y = pos_x.reshape(bx, nx, -1), pos_y.reshape(by, ny, -1)
+ pos = torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2)
+ return pos
+
+ @torch.no_grad()
+ def _pe(self, B, device, *cache_key):
+ H, W = cache_key
+ if cache_key in self.cache:
+ return self.cache[cache_key].to(device)[None].repeat(B, 1, 1, 1)
+
+ y_embed = (
+ torch.arange(1, H + 1, dtype=torch.float32, device=device)
+ .view(1, -1, 1)
+ .repeat(B, 1, W)
+ )
+ x_embed = (
+ torch.arange(1, W + 1, dtype=torch.float32, device=device)
+ .view(1, 1, -1)
+ .repeat(B, H, 1)
+ )
+
+ if self.normalize:
+ eps = 1e-6
+ y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+ x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+ dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=device)
+ dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+ pos_x = x_embed[:, :, :, None] / dim_t
+ pos_y = y_embed[:, :, :, None] / dim_t
+ pos_x = torch.stack(
+ (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
+ ).flatten(3)
+ pos_y = torch.stack(
+ (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
+ ).flatten(3)
+ pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+ self.cache[cache_key] = pos[0]
+ return pos
+
+ @torch.no_grad()
+ def forward(self, x: torch.Tensor):
+ B = x.shape[0]
+ cache_key = (x.shape[-2], x.shape[-1])
+ return self._pe(B, x.device, *cache_key)
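+
+# Illustrative usage (not part of the upstream file): the module maps any
+# (B, C, H, W) feature map to a (B, num_pos_feats, H, W) sine/cosine grid,
+# caching one copy per spatial size, e.g.
+#
+#   pe = PositionEmbeddingSine(num_pos_feats=256, warmup_cache=False)
+#   pos = pe(torch.zeros(2, 256, 64, 64))   # -> torch.Size([2, 256, 64, 64])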
+
+
+class PositionEmbeddingRandom(nn.Module):
+ """
+ Positional encoding using random spatial frequencies.
+ """
+
+ def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+ super().__init__()
+ if scale is None or scale <= 0.0:
+ scale = 1.0
+ self.register_buffer(
+ "positional_encoding_gaussian_matrix",
+ scale * torch.randn((2, num_pos_feats)),
+ )
+
+ def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
+ """Positionally encode points that are normalized to [0,1]."""
+ # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+ coords = 2 * coords - 1
+ coords = coords @ self.positional_encoding_gaussian_matrix
+ coords = 2 * np.pi * coords
+ # outputs d_1 x ... x d_n x C shape
+ return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
+
+ def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+ """Generate positional encoding for a grid of the specified size."""
+ h, w = size
+ device: Any = self.positional_encoding_gaussian_matrix.device
+ grid = torch.ones((h, w), device=device, dtype=torch.float32)
+ y_embed = grid.cumsum(dim=0) - 0.5
+ x_embed = grid.cumsum(dim=1) - 0.5
+ y_embed = y_embed / h
+ x_embed = x_embed / w
+
+ pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
+ return pe.permute(2, 0, 1) # C x H x W
+
+ def forward_with_coords(
+ self, coords_input: torch.Tensor, image_size: Tuple[int, int]
+ ) -> torch.Tensor:
+ """Positionally encode points that are not normalized to [0,1]."""
+ coords = coords_input.clone()
+ coords[:, :, 0] = coords[:, :, 0] / image_size[1]
+ coords[:, :, 1] = coords[:, :, 1] / image_size[0]
+ return self._pe_encoding(coords.to(torch.float)) # B x N x C
+
+
+# Rotary Positional Encoding, adapted from:
+# 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py
+# 2. https://github.com/naver-ai/rope-vit
+# 3. https://github.com/lucidrains/rotary-embedding-torch
+
+
+def init_t_xy(end_x: int, end_y: int):
+ t = torch.arange(end_x * end_y, dtype=torch.float32)
+ t_x = (t % end_x).float()
+ t_y = torch.div(t, end_x, rounding_mode="floor").float()
+ return t_x, t_y
+
+
+def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
+ freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+ freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+
+ t_x, t_y = init_t_xy(end_x, end_y)
+ freqs_x = torch.outer(t_x, freqs_x)
+ freqs_y = torch.outer(t_y, freqs_y)
+ freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
+ freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
+ return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
+
+
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+ ndim = x.ndim
+ assert 0 <= 1 < ndim
+ assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
+ shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
+ return freqs_cis.view(*shape)
+
+
+def apply_rotary_enc(
+ xq: torch.Tensor,
+ xk: torch.Tensor,
+ freqs_cis: torch.Tensor,
+ repeat_freqs_k: bool = False,
+):
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+ xk_ = (
+ torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+ if xk.shape[-2] != 0
+ else None
+ )
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+ if xk_ is None:
+ # no keys to rotate, due to dropout
+ return xq_out.type_as(xq).to(xq.device), xk
+ # repeat freqs along seq_len dim to match k seq_len
+ if repeat_freqs_k:
+ r = xk_.shape[-2] // xq_.shape[-2]
+ if freqs_cis.is_cuda:
+ freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
+ else:
+ # torch.repeat on complex numbers may not be supported on non-CUDA devices
+ # (freqs_cis has 4 dims and we repeat on dim 2) so we use expand + flatten
+ freqs_cis = freqs_cis.unsqueeze(2).expand(-1, -1, r, -1, -1).flatten(2, 3)
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+ return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)
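+
+
+if __name__ == "__main__":
+    # Minimal sketch, not part of the upstream file: build axial RoPE frequencies for
+    # an 8x8 token grid and rotate matching q/k tensors, as the RoPE attention in
+    # sam/transformer.py does internally.
+    head_dim = 64
+    freqs_cis = compute_axial_cis(dim=head_dim, end_x=8, end_y=8)  # (64, 32) complex
+    q = torch.randn(1, 4, 64, head_dim)                            # (B, heads, H*W, head_dim)
+    k = torch.randn(1, 4, 64, head_dim)
+    q_rot, k_rot = apply_rotary_enc(q, k, freqs_cis=freqs_cis)
+    print(q_rot.shape, k_rot.shape)                                # unchanged shapes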
diff --git a/phantom/submodules/sam2/sam2/modeling/sam/__init__.py b/phantom/submodules/sam2/sam2/modeling/sam/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/sam/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/sam2/modeling/sam/mask_decoder.py b/phantom/submodules/sam2/sam2/modeling/sam/mask_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bebc0366b2703ffcb80a44bfd19cce8339b4fed
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/sam/mask_decoder.py
@@ -0,0 +1,295 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional, Tuple, Type
+
+import torch
+from torch import nn
+
+from sam2.modeling.sam2_utils import LayerNorm2d, MLP
+
+
+class MaskDecoder(nn.Module):
+ def __init__(
+ self,
+ *,
+ transformer_dim: int,
+ transformer: nn.Module,
+ num_multimask_outputs: int = 3,
+ activation: Type[nn.Module] = nn.GELU,
+ iou_head_depth: int = 3,
+ iou_head_hidden_dim: int = 256,
+ use_high_res_features: bool = False,
+ iou_prediction_use_sigmoid=False,
+ dynamic_multimask_via_stability=False,
+ dynamic_multimask_stability_delta=0.05,
+ dynamic_multimask_stability_thresh=0.98,
+ pred_obj_scores: bool = False,
+ pred_obj_scores_mlp: bool = False,
+ use_multimask_token_for_obj_ptr: bool = False,
+ ) -> None:
+ """
+ Predicts masks given an image and prompt embeddings, using a
+ transformer architecture.
+
+ Arguments:
+ transformer_dim (int): the channel dimension of the transformer
+ transformer (nn.Module): the transformer used to predict masks
+ num_multimask_outputs (int): the number of masks to predict
+ when disambiguating masks
+ activation (nn.Module): the type of activation to use when
+ upscaling masks
+ iou_head_depth (int): the depth of the MLP used to predict
+ mask quality
+ iou_head_hidden_dim (int): the hidden dimension of the MLP
+ used to predict mask quality
+ """
+ super().__init__()
+ self.transformer_dim = transformer_dim
+ self.transformer = transformer
+
+ self.num_multimask_outputs = num_multimask_outputs
+
+ self.iou_token = nn.Embedding(1, transformer_dim)
+ self.num_mask_tokens = num_multimask_outputs + 1
+ self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
+
+ self.pred_obj_scores = pred_obj_scores
+ if self.pred_obj_scores:
+ self.obj_score_token = nn.Embedding(1, transformer_dim)
+ self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
+
+ self.output_upscaling = nn.Sequential(
+ nn.ConvTranspose2d(
+ transformer_dim, transformer_dim // 4, kernel_size=2, stride=2
+ ),
+ LayerNorm2d(transformer_dim // 4),
+ activation(),
+ nn.ConvTranspose2d(
+ transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2
+ ),
+ activation(),
+ )
+ self.use_high_res_features = use_high_res_features
+ if use_high_res_features:
+ self.conv_s0 = nn.Conv2d(
+ transformer_dim, transformer_dim // 8, kernel_size=1, stride=1
+ )
+ self.conv_s1 = nn.Conv2d(
+ transformer_dim, transformer_dim // 4, kernel_size=1, stride=1
+ )
+
+ self.output_hypernetworks_mlps = nn.ModuleList(
+ [
+ MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
+ for i in range(self.num_mask_tokens)
+ ]
+ )
+
+ self.iou_prediction_head = MLP(
+ transformer_dim,
+ iou_head_hidden_dim,
+ self.num_mask_tokens,
+ iou_head_depth,
+ sigmoid_output=iou_prediction_use_sigmoid,
+ )
+ if self.pred_obj_scores:
+ self.pred_obj_score_head = nn.Linear(transformer_dim, 1)
+ if pred_obj_scores_mlp:
+ self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3)
+
+ # When outputting a single mask, optionally we can dynamically fall back to the best
+ # multimask output token if the single mask output token gives low stability scores.
+ self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
+ self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
+ self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh
+
+ def forward(
+ self,
+ image_embeddings: torch.Tensor,
+ image_pe: torch.Tensor,
+ sparse_prompt_embeddings: torch.Tensor,
+ dense_prompt_embeddings: torch.Tensor,
+ multimask_output: bool,
+ repeat_image: bool,
+ high_res_features: Optional[List[torch.Tensor]] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Predict masks given image and prompt embeddings.
+
+ Arguments:
+ image_embeddings (torch.Tensor): the embeddings from the image encoder
+ image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
+ sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
+ dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
+ multimask_output (bool): Whether to return multiple masks or a single
+ mask.
+
+ Returns:
+ torch.Tensor: batched predicted masks
+ torch.Tensor: batched predictions of mask quality
+ torch.Tensor: batched SAM token for mask output
+ """
+ masks, iou_pred, mask_tokens_out, object_score_logits = self.predict_masks(
+ image_embeddings=image_embeddings,
+ image_pe=image_pe,
+ sparse_prompt_embeddings=sparse_prompt_embeddings,
+ dense_prompt_embeddings=dense_prompt_embeddings,
+ repeat_image=repeat_image,
+ high_res_features=high_res_features,
+ )
+
+ # Select the correct mask or masks for output
+ if multimask_output:
+ masks = masks[:, 1:, :, :]
+ iou_pred = iou_pred[:, 1:]
+ elif self.dynamic_multimask_via_stability and not self.training:
+ masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred)
+ else:
+ masks = masks[:, 0:1, :, :]
+ iou_pred = iou_pred[:, 0:1]
+
+ if multimask_output and self.use_multimask_token_for_obj_ptr:
+ sam_tokens_out = mask_tokens_out[:, 1:] # [b, 3, c] shape
+ else:
+ # Take the mask output token. Here we *always* use the token for single mask output.
+ # At test time, even if we track after 1-click (and using multimask_output=True),
+ # we still take the single mask token here. The rationale is that we always track
+ # after multiple clicks during training, so the past tokens seen during training
+ # are always the single mask token (and we'll let it be the object-memory token).
+ sam_tokens_out = mask_tokens_out[:, 0:1] # [b, 1, c] shape
+
+ # Prepare output
+ return masks, iou_pred, sam_tokens_out, object_score_logits
+
+ def predict_masks(
+ self,
+ image_embeddings: torch.Tensor,
+ image_pe: torch.Tensor,
+ sparse_prompt_embeddings: torch.Tensor,
+ dense_prompt_embeddings: torch.Tensor,
+ repeat_image: bool,
+ high_res_features: Optional[List[torch.Tensor]] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Predicts masks. See 'forward' for more details."""
+ # Concatenate output tokens
+ s = 0
+ if self.pred_obj_scores:
+ output_tokens = torch.cat(
+ [
+ self.obj_score_token.weight,
+ self.iou_token.weight,
+ self.mask_tokens.weight,
+ ],
+ dim=0,
+ )
+ s = 1
+ else:
+ output_tokens = torch.cat(
+ [self.iou_token.weight, self.mask_tokens.weight], dim=0
+ )
+ output_tokens = output_tokens.unsqueeze(0).expand(
+ sparse_prompt_embeddings.size(0), -1, -1
+ )
+ tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
+
+ # Expand per-image data in batch direction to be per-mask
+ if repeat_image:
+ src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
+ else:
+ assert image_embeddings.shape[0] == tokens.shape[0]
+ src = image_embeddings
+ src = src + dense_prompt_embeddings
+ assert (
+ image_pe.size(0) == 1
+ ), "image_pe should have size 1 in batch dim (from `get_dense_pe()`)"
+ pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
+ b, c, h, w = src.shape
+
+ # Run the transformer
+ hs, src = self.transformer(src, pos_src, tokens)
+ iou_token_out = hs[:, s, :]
+ mask_tokens_out = hs[:, s + 1 : (s + 1 + self.num_mask_tokens), :]
+
+ # Upscale mask embeddings and predict masks using the mask tokens
+ src = src.transpose(1, 2).view(b, c, h, w)
+ if not self.use_high_res_features:
+ upscaled_embedding = self.output_upscaling(src)
+ else:
+ dc1, ln1, act1, dc2, act2 = self.output_upscaling
+ feat_s0, feat_s1 = high_res_features
+ upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
+ upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)
+
+ hyper_in_list: List[torch.Tensor] = []
+ for i in range(self.num_mask_tokens):
+ hyper_in_list.append(
+ self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])
+ )
+ hyper_in = torch.stack(hyper_in_list, dim=1)
+ b, c, h, w = upscaled_embedding.shape
+ masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
+
+ # Generate mask quality predictions
+ iou_pred = self.iou_prediction_head(iou_token_out)
+ if self.pred_obj_scores:
+ assert s == 1
+ object_score_logits = self.pred_obj_score_head(hs[:, 0, :])
+ else:
+ # Obj scores logits - default to 10.0, i.e. assuming the object is present, sigmoid(10)=1
+ object_score_logits = 10.0 * iou_pred.new_ones(iou_pred.shape[0], 1)
+
+ return masks, iou_pred, mask_tokens_out, object_score_logits
+
+ def _get_stability_scores(self, mask_logits):
+ """
+ Compute stability scores of the mask logits based on the IoU between upper and
+ lower thresholds.
+ """
+ mask_logits = mask_logits.flatten(-2)
+ stability_delta = self.dynamic_multimask_stability_delta
+ area_i = torch.sum(mask_logits > stability_delta, dim=-1).float()
+ area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float()
+ stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0)
+ return stability_scores
+
+ def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
+ """
+ When outputting a single mask, if the stability score from the current single-mask
+ output (based on output token 0) falls below a threshold, we instead select from
+ multi-mask outputs (based on output token 1~3) the mask with the highest predicted
+ IoU score. This is intended to ensure a valid mask for both clicking and tracking.
+ """
+ # The best mask from multimask output tokens (1~3)
+ multimask_logits = all_mask_logits[:, 1:, :, :]
+ multimask_iou_scores = all_iou_scores[:, 1:]
+ best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)
+ batch_inds = torch.arange(
+ multimask_iou_scores.size(0), device=all_iou_scores.device
+ )
+ best_multimask_logits = multimask_logits[batch_inds, best_scores_inds]
+ best_multimask_logits = best_multimask_logits.unsqueeze(1)
+ best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds]
+ best_multimask_iou_scores = best_multimask_iou_scores.unsqueeze(1)
+
+ # The mask from singlemask output token 0 and its stability score
+ singlemask_logits = all_mask_logits[:, 0:1, :, :]
+ singlemask_iou_scores = all_iou_scores[:, 0:1]
+ stability_scores = self._get_stability_scores(singlemask_logits)
+ is_stable = stability_scores >= self.dynamic_multimask_stability_thresh
+
+ # Dynamically fall back to best multimask output upon low stability scores.
+ mask_logits_out = torch.where(
+ is_stable[..., None, None].expand_as(singlemask_logits),
+ singlemask_logits,
+ best_multimask_logits,
+ )
+ iou_scores_out = torch.where(
+ is_stable.expand_as(singlemask_iou_scores),
+ singlemask_iou_scores,
+ best_multimask_iou_scores,
+ )
+ return mask_logits_out, iou_scores_out
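+
+
+if __name__ == "__main__":
+    # Minimal sketch, not part of the upstream file: the stability score used by
+    # _dynamic_multimask_via_stability is the IoU between the mask thresholded at
+    # +delta and at -delta; values near 1.0 mean the single-mask output is trusted,
+    # otherwise the decoder falls back to the best multimask candidate.
+    delta = 0.05                                   # dynamic_multimask_stability_delta
+    logits = torch.randn(2, 1, 256, 256).flatten(-2)
+    area_i = torch.sum(logits > delta, dim=-1).float()
+    area_u = torch.sum(logits > -delta, dim=-1).float()
+    stability = torch.where(area_u > 0, area_i / area_u, 1.0)
+    print(stability)                               # (2, 1); compare against thresh 0.98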
diff --git a/phantom/submodules/sam2/sam2/modeling/sam/prompt_encoder.py b/phantom/submodules/sam2/sam2/modeling/sam/prompt_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57876264b51f8c5236867359350e32d590efcb5
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/sam/prompt_encoder.py
@@ -0,0 +1,202 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional, Tuple, Type
+
+import torch
+from torch import nn
+
+from sam2.modeling.position_encoding import PositionEmbeddingRandom
+
+from sam2.modeling.sam2_utils import LayerNorm2d
+
+
+class PromptEncoder(nn.Module):
+ def __init__(
+ self,
+ embed_dim: int,
+ image_embedding_size: Tuple[int, int],
+ input_image_size: Tuple[int, int],
+ mask_in_chans: int,
+ activation: Type[nn.Module] = nn.GELU,
+ ) -> None:
+ """
+ Encodes prompts for input to SAM's mask decoder.
+
+ Arguments:
+ embed_dim (int): The prompts' embedding dimension
+ image_embedding_size (tuple(int, int)): The spatial size of the
+ image embedding, as (H, W).
+          input_image_size (tuple(int, int)): The padded size of the image as input
+ to the image encoder, as (H, W).
+ mask_in_chans (int): The number of hidden channels used for
+ encoding input masks.
+ activation (nn.Module): The activation to use when encoding
+ input masks.
+ """
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.input_image_size = input_image_size
+ self.image_embedding_size = image_embedding_size
+ self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
+
+ self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners
+ point_embeddings = [
+ nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)
+ ]
+ self.point_embeddings = nn.ModuleList(point_embeddings)
+ self.not_a_point_embed = nn.Embedding(1, embed_dim)
+
+ self.mask_input_size = (
+ 4 * image_embedding_size[0],
+ 4 * image_embedding_size[1],
+ )
+ self.mask_downscaling = nn.Sequential(
+ nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
+ LayerNorm2d(mask_in_chans // 4),
+ activation(),
+ nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
+ LayerNorm2d(mask_in_chans),
+ activation(),
+ nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
+ )
+ self.no_mask_embed = nn.Embedding(1, embed_dim)
+
+ def get_dense_pe(self) -> torch.Tensor:
+ """
+ Returns the positional encoding used to encode point prompts,
+ applied to a dense set of points the shape of the image encoding.
+
+ Returns:
+ torch.Tensor: Positional encoding with shape
+ 1x(embed_dim)x(embedding_h)x(embedding_w)
+ """
+ return self.pe_layer(self.image_embedding_size).unsqueeze(0)
+
+ def _embed_points(
+ self,
+ points: torch.Tensor,
+ labels: torch.Tensor,
+ pad: bool,
+ ) -> torch.Tensor:
+ """Embeds point prompts."""
+ points = points + 0.5 # Shift to center of pixel
+ if pad:
+ padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
+ padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
+ points = torch.cat([points, padding_point], dim=1)
+ labels = torch.cat([labels, padding_label], dim=1)
+ point_embedding = self.pe_layer.forward_with_coords(
+ points, self.input_image_size
+ )
+
+ point_embedding = torch.where(
+ (labels == -1).unsqueeze(-1),
+ torch.zeros_like(point_embedding) + self.not_a_point_embed.weight,
+ point_embedding,
+ )
+ point_embedding = torch.where(
+ (labels == 0).unsqueeze(-1),
+ point_embedding + self.point_embeddings[0].weight,
+ point_embedding,
+ )
+ point_embedding = torch.where(
+ (labels == 1).unsqueeze(-1),
+ point_embedding + self.point_embeddings[1].weight,
+ point_embedding,
+ )
+ point_embedding = torch.where(
+ (labels == 2).unsqueeze(-1),
+ point_embedding + self.point_embeddings[2].weight,
+ point_embedding,
+ )
+ point_embedding = torch.where(
+ (labels == 3).unsqueeze(-1),
+ point_embedding + self.point_embeddings[3].weight,
+ point_embedding,
+ )
+ return point_embedding
+
+ def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
+ """Embeds box prompts."""
+ boxes = boxes + 0.5 # Shift to center of pixel
+ coords = boxes.reshape(-1, 2, 2)
+ corner_embedding = self.pe_layer.forward_with_coords(
+ coords, self.input_image_size
+ )
+ corner_embedding[:, 0, :] += self.point_embeddings[2].weight
+ corner_embedding[:, 1, :] += self.point_embeddings[3].weight
+ return corner_embedding
+
+ def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
+ """Embeds mask inputs."""
+ mask_embedding = self.mask_downscaling(masks)
+ return mask_embedding
+
+ def _get_batch_size(
+ self,
+ points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+ boxes: Optional[torch.Tensor],
+ masks: Optional[torch.Tensor],
+ ) -> int:
+ """
+ Gets the batch size of the output given the batch size of the input prompts.
+ """
+ if points is not None:
+ return points[0].shape[0]
+ elif boxes is not None:
+ return boxes.shape[0]
+ elif masks is not None:
+ return masks.shape[0]
+ else:
+ return 1
+
+ def _get_device(self) -> torch.device:
+ return self.point_embeddings[0].weight.device
+
+ def forward(
+ self,
+ points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+ boxes: Optional[torch.Tensor],
+ masks: Optional[torch.Tensor],
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Embeds different types of prompts, returning both sparse and dense
+ embeddings.
+
+ Arguments:
+ points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
+ and labels to embed.
+ boxes (torch.Tensor or none): boxes to embed
+ masks (torch.Tensor or none): masks to embed
+
+ Returns:
+ torch.Tensor: sparse embeddings for the points and boxes, with shape
+ BxNx(embed_dim), where N is determined by the number of input points
+ and boxes.
+ torch.Tensor: dense embeddings for the masks, in the shape
+ Bx(embed_dim)x(embed_H)x(embed_W)
+ """
+ bs = self._get_batch_size(points, boxes, masks)
+ sparse_embeddings = torch.empty(
+ (bs, 0, self.embed_dim), device=self._get_device()
+ )
+ if points is not None:
+ coords, labels = points
+ point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
+ sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
+ if boxes is not None:
+ box_embeddings = self._embed_boxes(boxes)
+ sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
+
+ if masks is not None:
+ dense_embeddings = self._embed_masks(masks)
+ else:
+ dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
+ bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
+ )
+
+ return sparse_embeddings, dense_embeddings
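+
+
+if __name__ == "__main__":
+    # Illustrative sketch only (not part of upstream SAM2): exercise the prompt encoder
+    # with two point prompts. The sizes below (256-dim embeddings, a 64x64 embedding
+    # grid, 1024x1024 input images) are assumptions chosen to mirror typical SAM-style
+    # settings, not values read from this repository's configs.
+    encoder = PromptEncoder(
+        embed_dim=256,
+        image_embedding_size=(64, 64),
+        input_image_size=(1024, 1024),
+        mask_in_chans=16,
+    )
+    coords = torch.rand(1, 2, 2) * 1024      # (x, y) pixel coordinates of 2 points
+    labels = torch.tensor([[1.0, 0.0]])      # 1 = positive click, 0 = negative click
+    sparse, dense = encoder(points=(coords, labels), boxes=None, masks=None)
+    # sparse: [1, 3, 256] (2 points + 1 padding point); dense: [1, 256, 64, 64]
+    print(sparse.shape, dense.shape)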
diff --git a/phantom/submodules/sam2/sam2/modeling/sam/transformer.py b/phantom/submodules/sam2/sam2/modeling/sam/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9fe9a3fbc5cce4f1abe8ee0ae3a8602bbe2ff1b
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/sam/transformer.py
@@ -0,0 +1,311 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from functools import partial
+from typing import Tuple, Type
+
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from sam2.modeling.position_encoding import apply_rotary_enc, compute_axial_cis
+from sam2.modeling.sam2_utils import MLP
+
+
+class TwoWayTransformer(nn.Module):
+ def __init__(
+ self,
+ depth: int,
+ embedding_dim: int,
+ num_heads: int,
+ mlp_dim: int,
+ activation: Type[nn.Module] = nn.ReLU,
+ attention_downsample_rate: int = 2,
+ ) -> None:
+ """
+ A transformer decoder that attends to an input image using
+ queries whose positional embedding is supplied.
+
+ Args:
+ depth (int): number of layers in the transformer
+ embedding_dim (int): the channel dimension for the input embeddings
+ num_heads (int): the number of heads for multihead attention. Must
+ divide embedding_dim
+ mlp_dim (int): the channel dimension internal to the MLP block
+ activation (nn.Module): the activation to use in the MLP block
+ """
+ super().__init__()
+ self.depth = depth
+ self.embedding_dim = embedding_dim
+ self.num_heads = num_heads
+ self.mlp_dim = mlp_dim
+ self.layers = nn.ModuleList()
+
+ for i in range(depth):
+ self.layers.append(
+ TwoWayAttentionBlock(
+ embedding_dim=embedding_dim,
+ num_heads=num_heads,
+ mlp_dim=mlp_dim,
+ activation=activation,
+ attention_downsample_rate=attention_downsample_rate,
+ skip_first_layer_pe=(i == 0),
+ )
+ )
+
+ self.final_attn_token_to_image = Attention(
+ embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+ )
+ self.norm_final_attn = nn.LayerNorm(embedding_dim)
+
+ def forward(
+ self,
+ image_embedding: Tensor,
+ image_pe: Tensor,
+ point_embedding: Tensor,
+ ) -> Tuple[Tensor, Tensor]:
+ """
+ Args:
+ image_embedding (torch.Tensor): image to attend to. Should be shape
+ B x embedding_dim x h x w for any h and w.
+ image_pe (torch.Tensor): the positional encoding to add to the image. Must
+ have the same shape as image_embedding.
+ point_embedding (torch.Tensor): the embedding to add to the query points.
+ Must have shape B x N_points x embedding_dim for any N_points.
+
+ Returns:
+ torch.Tensor: the processed point_embedding
+ torch.Tensor: the processed image_embedding
+ """
+ # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+ bs, c, h, w = image_embedding.shape
+ image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
+ image_pe = image_pe.flatten(2).permute(0, 2, 1)
+
+ # Prepare queries
+ queries = point_embedding
+ keys = image_embedding
+
+ # Apply transformer blocks and final layernorm
+ for layer in self.layers:
+ queries, keys = layer(
+ queries=queries,
+ keys=keys,
+ query_pe=point_embedding,
+ key_pe=image_pe,
+ )
+
+ # Apply the final attention layer from the points to the image
+ q = queries + point_embedding
+ k = keys + image_pe
+ attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
+ queries = queries + attn_out
+ queries = self.norm_final_attn(queries)
+
+ return queries, keys
+
+
+class TwoWayAttentionBlock(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int,
+ num_heads: int,
+ mlp_dim: int = 2048,
+ activation: Type[nn.Module] = nn.ReLU,
+ attention_downsample_rate: int = 2,
+ skip_first_layer_pe: bool = False,
+ ) -> None:
+ """
+ A transformer block with four layers: (1) self-attention of sparse
+ inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
+ block on sparse inputs, and (4) cross attention of dense inputs to sparse
+ inputs.
+
+ Arguments:
+ embedding_dim (int): the channel dimension of the embeddings
+ num_heads (int): the number of heads in the attention layers
+ mlp_dim (int): the hidden dimension of the mlp block
+ activation (nn.Module): the activation of the mlp block
+ skip_first_layer_pe (bool): skip the PE on the first layer
+ """
+ super().__init__()
+ self.self_attn = Attention(embedding_dim, num_heads)
+ self.norm1 = nn.LayerNorm(embedding_dim)
+
+ self.cross_attn_token_to_image = Attention(
+ embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+ )
+ self.norm2 = nn.LayerNorm(embedding_dim)
+
+ self.mlp = MLP(
+ embedding_dim, mlp_dim, embedding_dim, num_layers=2, activation=activation
+ )
+ self.norm3 = nn.LayerNorm(embedding_dim)
+
+ self.norm4 = nn.LayerNorm(embedding_dim)
+ self.cross_attn_image_to_token = Attention(
+ embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+ )
+
+ self.skip_first_layer_pe = skip_first_layer_pe
+
+ def forward(
+ self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
+ ) -> Tuple[Tensor, Tensor]:
+ # Self attention block
+ if self.skip_first_layer_pe:
+ queries = self.self_attn(q=queries, k=queries, v=queries)
+ else:
+ q = queries + query_pe
+ attn_out = self.self_attn(q=q, k=q, v=queries)
+ queries = queries + attn_out
+ queries = self.norm1(queries)
+
+ # Cross attention block, tokens attending to image embedding
+ q = queries + query_pe
+ k = keys + key_pe
+ attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
+ queries = queries + attn_out
+ queries = self.norm2(queries)
+
+ # MLP block
+ mlp_out = self.mlp(queries)
+ queries = queries + mlp_out
+ queries = self.norm3(queries)
+
+ # Cross attention block, image embedding attending to tokens
+ q = queries + query_pe
+ k = keys + key_pe
+ attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
+ keys = keys + attn_out
+ keys = self.norm4(keys)
+
+ return queries, keys
+
+
+class Attention(nn.Module):
+ """
+ An attention layer that allows for downscaling the size of the embedding
+ after projection to queries, keys, and values.
+ """
+
+ def __init__(
+ self,
+ embedding_dim: int,
+ num_heads: int,
+ downsample_rate: int = 1,
+ dropout: float = 0.0,
+ kv_in_dim: int = None,
+ ) -> None:
+ super().__init__()
+ self.embedding_dim = embedding_dim
+ self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim
+ self.internal_dim = embedding_dim // downsample_rate
+ self.num_heads = num_heads
+ assert (
+ self.internal_dim % num_heads == 0
+ ), "num_heads must divide embedding_dim."
+
+ self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
+ self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+ self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+ self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
+
+ self.dropout_p = dropout
+
+ def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+ b, n, c = x.shape
+ x = x.reshape(b, n, num_heads, c // num_heads)
+ return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head
+
+ def _recombine_heads(self, x: Tensor) -> Tensor:
+ b, n_heads, n_tokens, c_per_head = x.shape
+ x = x.transpose(1, 2)
+ return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C
+
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+ # Input projections
+ q = self.q_proj(q)
+ k = self.k_proj(k)
+ v = self.v_proj(v)
+
+ # Separate into heads
+ q = self._separate_heads(q, self.num_heads)
+ k = self._separate_heads(k, self.num_heads)
+ v = self._separate_heads(v, self.num_heads)
+
+ dropout_p = self.dropout_p if self.training else 0.0
+ # Attention
+ out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+
+ out = self._recombine_heads(out)
+ out = self.out_proj(out)
+
+ return out
+
+
+class RoPEAttention(Attention):
+ """Attention with rotary position encoding."""
+
+ def __init__(
+ self,
+ *args,
+ rope_theta=10000.0,
+ # whether to repeat q rope to match k length
+ # this is needed for cross-attention to memories
+ rope_k_repeat=False,
+ feat_sizes=(64, 64), # [w, h] for stride 16 feats at 1024 resolution
+ **kwargs,
+ ):
+ super().__init__(*args, **kwargs)
+
+ self.compute_cis = partial(
+ compute_axial_cis, dim=self.internal_dim // self.num_heads, theta=rope_theta
+ )
+ freqs_cis = self.compute_cis(end_x=feat_sizes[0], end_y=feat_sizes[1])
+ self.freqs_cis = (
+ freqs_cis.to("cuda") if torch.cuda.is_available() else freqs_cis
+ )
+ self.rope_k_repeat = rope_k_repeat
+
+ def forward(
+ self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0
+ ) -> Tensor:
+ # Input projections
+ q = self.q_proj(q)
+ k = self.k_proj(k)
+ v = self.v_proj(v)
+
+ # Separate into heads
+ q = self._separate_heads(q, self.num_heads)
+ k = self._separate_heads(k, self.num_heads)
+ v = self._separate_heads(v, self.num_heads)
+
+ # Apply rotary position encoding
+ w = h = math.sqrt(q.shape[-2])
+ self.freqs_cis = self.freqs_cis.to(q.device)
+ if self.freqs_cis.shape[0] != q.shape[-2]:
+ self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device)
+ if q.shape[-2] != k.shape[-2]:
+ assert self.rope_k_repeat
+
+ num_k_rope = k.size(-2) - num_k_exclude_rope
+ q, k[:, :, :num_k_rope] = apply_rotary_enc(
+ q,
+ k[:, :, :num_k_rope],
+ freqs_cis=self.freqs_cis,
+ repeat_freqs_k=self.rope_k_repeat,
+ )
+
+ dropout_p = self.dropout_p if self.training else 0.0
+ # Attention
+ out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+
+ out = self._recombine_heads(out)
+ out = self.out_proj(out)
+
+ return out
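+
+
+if __name__ == "__main__":
+    # Illustrative sketch only (not part of upstream SAM2): run the two-way transformer
+    # on a small fake image embedding (a 16x16 grid with 256 channels) and 3 prompt
+    # tokens. All sizes here are assumptions for demonstration, not SAM2's actual configs.
+    tfm = TwoWayTransformer(depth=2, embedding_dim=256, num_heads=8, mlp_dim=2048)
+    image_embedding = torch.randn(1, 256, 16, 16)
+    image_pe = torch.randn(1, 256, 16, 16)
+    prompt_tokens = torch.randn(1, 3, 256)
+    queries, keys = tfm(
+        image_embedding=image_embedding,
+        image_pe=image_pe,
+        point_embedding=prompt_tokens,
+    )
+    # queries: [1, 3, 256] (prompt tokens); keys: [1, 256, 256] (16*16 image tokens)
+    print(queries.shape, keys.shape)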
diff --git a/phantom/submodules/sam2/sam2/modeling/sam2_base.py b/phantom/submodules/sam2/sam2/modeling/sam2_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9f4e515b0d161942bf2bb64560056b3efbe6dac
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/sam2_base.py
@@ -0,0 +1,909 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.distributed
+import torch.nn.functional as F
+
+from torch.nn.init import trunc_normal_
+
+from sam2.modeling.sam.mask_decoder import MaskDecoder
+from sam2.modeling.sam.prompt_encoder import PromptEncoder
+from sam2.modeling.sam.transformer import TwoWayTransformer
+from sam2.modeling.sam2_utils import get_1d_sine_pe, MLP, select_closest_cond_frames
+
+# a large negative value as a placeholder score for missing objects
+NO_OBJ_SCORE = -1024.0
+
+
+class SAM2Base(torch.nn.Module):
+ def __init__(
+ self,
+ image_encoder,
+ memory_attention,
+ memory_encoder,
+ num_maskmem=7, # default 1 input frame + 6 previous frames
+ image_size=512,
+ backbone_stride=16, # stride of the image backbone output
+ sigmoid_scale_for_mem_enc=1.0, # scale factor for mask sigmoid prob
+ sigmoid_bias_for_mem_enc=0.0, # bias factor for mask sigmoid prob
+ # During evaluation, whether to binarize the sigmoid mask logits on interacted frames with clicks
+ binarize_mask_from_pts_for_mem_enc=False,
+ use_mask_input_as_output_without_sam=False, # on frames with mask input, whether to directly output the input mask without using a SAM prompt encoder + mask decoder
+ # The maximum number of conditioning frames to participate in the memory attention (-1 means no limit; if there are more conditioning frames than this limit,
+ # we only cross-attend to the temporally closest `max_cond_frames_in_attn` conditioning frames in the encoder when tracking each frame). This gives the model
+ # a temporal locality when handling a large number of annotated frames (since closer frames should be more important) and also avoids GPU OOM.
+ max_cond_frames_in_attn=-1,
+ # on the first frame, whether to directly add the no-memory embedding to the image feature
+ # (instead of using the transformer encoder)
+ directly_add_no_mem_embed=False,
+ # whether to use high-resolution feature maps in the SAM mask decoder
+ use_high_res_features_in_sam=False,
+ # whether to output multiple (3) masks for the first click on initial conditioning frames
+ multimask_output_in_sam=False,
+ # the minimum and maximum number of clicks to use multimask_output_in_sam (only relevant when `multimask_output_in_sam=True`;
+ # default is 1 for both, meaning that only the first click gives multimask output; also note that a box counts as two points)
+ multimask_min_pt_num=1,
+ multimask_max_pt_num=1,
+ # whether to also use multimask output for tracking (not just for the first click on initial conditioning frames; only relevant when `multimask_output_in_sam=True`)
+ multimask_output_for_tracking=False,
+ # Whether to use multimask tokens for obj ptr; Only relevant when both
+ # use_obj_ptrs_in_encoder=True and multimask_output_for_tracking=True
+ use_multimask_token_for_obj_ptr: bool = False,
+ # whether to use sigmoid to restrict ious prediction to [0-1]
+ iou_prediction_use_sigmoid=False,
+ # The memory bank's temporal stride during evaluation (i.e. the `r` parameter in XMem and Cutie; XMem and Cutie use r=5).
+ # For r>1, the (self.num_maskmem - 1) non-conditioning memory frames consist of
+ # (self.num_maskmem - 2) nearest frames from every r-th frames, plus the last frame.
+ memory_temporal_stride_for_eval=1,
+ # whether to apply non-overlapping constraints on the object masks in the memory encoder during evaluation (to avoid/alleviate superposing masks)
+ non_overlap_masks_for_mem_enc=False,
+ # whether to cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder=False,
+ # the maximum number of object pointers from other frames in encoder cross attention (only relevant when `use_obj_ptrs_in_encoder=True`)
+ max_obj_ptrs_in_encoder=16,
+ # whether to add temporal positional encoding to the object pointers in the encoder (only relevant when `use_obj_ptrs_in_encoder=True`)
+ add_tpos_enc_to_obj_ptrs=True,
+ # whether to add an extra linear projection layer for the temporal positional encoding in the object pointers to avoid potential interference
+ # with spatial positional encoding (only relevant when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`)
+ proj_tpos_enc_in_obj_ptrs=False,
+ # whether to use signed distance (instead of unsigned absolute distance) in the temporal positional encoding in the object pointers
+ # (only relevant when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`)
+ use_signed_tpos_enc_to_obj_ptrs=False,
+ # whether to only attend to object pointers in the past (before the current frame) in the encoder during evaluation
+ # (only relevant when `use_obj_ptrs_in_encoder=True`; this might avoid pointer information too far in the future to distract the initial tracking)
+ only_obj_ptrs_in_the_past_for_eval=False,
+ # Whether to predict if there is an object in the frame
+ pred_obj_scores: bool = False,
+ # Whether to use an MLP to predict object scores
+ pred_obj_scores_mlp: bool = False,
+ # Only relevant if pred_obj_scores=True and use_obj_ptrs_in_encoder=True;
+ # Whether to have a fixed no obj pointer when there is no object present
+ # or to use it as an additive embedding with obj_ptr produced by decoder
+ fixed_no_obj_ptr: bool = False,
+ # Soft no object, i.e. mix in no_obj_ptr softly,
+ # hope to make recovery easier if there is a mistake and mitigate accumulation of errors
+ soft_no_obj_ptr: bool = False,
+ use_mlp_for_obj_ptr_proj: bool = False,
+ # add no obj embedding to spatial frames
+ no_obj_embed_spatial: bool = False,
+ # extra arguments used to construct the SAM mask decoder; if not None, it should be a dict of kwargs to be passed into `MaskDecoder` class.
+ sam_mask_decoder_extra_args=None,
+ compile_image_encoder: bool = False,
+ ):
+ super().__init__()
+
+ # Part 1: the image backbone
+ self.image_encoder = image_encoder
+ # Use level 0, 1, 2 for high-res setting, or just level 2 for the default setting
+ self.use_high_res_features_in_sam = use_high_res_features_in_sam
+ self.num_feature_levels = 3 if use_high_res_features_in_sam else 1
+ self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder
+ self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder
+ if use_obj_ptrs_in_encoder:
+ # A conv layer to downsample the mask prompt to stride 4 (the same stride as
+ # low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale,
+ # so that it can be fed into the SAM mask decoder to generate a pointer.
+ self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4)
+ self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs
+ if proj_tpos_enc_in_obj_ptrs:
+ assert add_tpos_enc_to_obj_ptrs # these options need to be used together
+ self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs
+ self.use_signed_tpos_enc_to_obj_ptrs = use_signed_tpos_enc_to_obj_ptrs
+ self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval
+
+ # Part 2: memory attention to condition current frame's visual features
+ # with memories (and obj ptrs) from past frames
+ self.memory_attention = memory_attention
+ self.hidden_dim = image_encoder.neck.d_model
+
+ # Part 3: memory encoder for the previous frame's outputs
+ self.memory_encoder = memory_encoder
+ self.mem_dim = self.hidden_dim
+ if hasattr(self.memory_encoder, "out_proj") and hasattr(
+ self.memory_encoder.out_proj, "weight"
+ ):
+ # if there is compression of memories along channel dim
+ self.mem_dim = self.memory_encoder.out_proj.weight.shape[0]
+ self.num_maskmem = num_maskmem # Number of memories accessible
+ # Temporal encoding of the memories
+ self.maskmem_tpos_enc = torch.nn.Parameter(
+ torch.zeros(num_maskmem, 1, 1, self.mem_dim)
+ )
+ trunc_normal_(self.maskmem_tpos_enc, std=0.02)
+ # a single token to indicate no memory embedding from previous frames
+ self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
+ self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
+ trunc_normal_(self.no_mem_embed, std=0.02)
+ trunc_normal_(self.no_mem_pos_enc, std=0.02)
+ self.directly_add_no_mem_embed = directly_add_no_mem_embed
+ # Apply sigmoid to the output raw mask logits (to turn them from
+ # range (-inf, +inf) to range (0, 1)) before feeding them into the memory encoder
+ self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc
+ self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc
+ self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc
+ self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc
+ self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval
+ # On frames with mask input, whether to directly output the input mask without
+ # using a SAM prompt encoder + mask decoder
+ self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam
+ self.multimask_output_in_sam = multimask_output_in_sam
+ self.multimask_min_pt_num = multimask_min_pt_num
+ self.multimask_max_pt_num = multimask_max_pt_num
+ self.multimask_output_for_tracking = multimask_output_for_tracking
+ self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
+ self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid
+
+ # Part 4: SAM-style prompt encoder (for both mask and point inputs)
+ # and SAM-style mask decoder for the final mask output
+ self.image_size = image_size
+ self.backbone_stride = backbone_stride
+ self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args
+ self.pred_obj_scores = pred_obj_scores
+ self.pred_obj_scores_mlp = pred_obj_scores_mlp
+ self.fixed_no_obj_ptr = fixed_no_obj_ptr
+ self.soft_no_obj_ptr = soft_no_obj_ptr
+ if self.fixed_no_obj_ptr:
+ assert self.pred_obj_scores
+ assert self.use_obj_ptrs_in_encoder
+ if self.pred_obj_scores and self.use_obj_ptrs_in_encoder:
+ self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim))
+ trunc_normal_(self.no_obj_ptr, std=0.02)
+ self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj
+ self.no_obj_embed_spatial = None
+ if no_obj_embed_spatial:
+ self.no_obj_embed_spatial = torch.nn.Parameter(torch.zeros(1, self.mem_dim))
+ trunc_normal_(self.no_obj_embed_spatial, std=0.02)
+
+ self._build_sam_heads()
+ self.max_cond_frames_in_attn = max_cond_frames_in_attn
+
+ # Model compilation
+ if compile_image_encoder:
+ # Compile the forward function (not the full module) to allow loading checkpoints.
+ print(
+ "Image encoder compilation is enabled. First forward pass will be slow."
+ )
+ self.image_encoder.forward = torch.compile(
+ self.image_encoder.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=False,
+ )
+
+ @property
+ def device(self):
+ return next(self.parameters()).device
+
+ def forward(self, *args, **kwargs):
+ raise NotImplementedError(
+ "Please use the corresponding methods in SAM2VideoPredictor for inference or SAM2Train for training/fine-tuning"
+ "See notebooks/video_predictor_example.ipynb for an inference example."
+ )
+
+ def _build_sam_heads(self):
+ """Build SAM-style prompt encoder and mask decoder."""
+ self.sam_prompt_embed_dim = self.hidden_dim
+ self.sam_image_embedding_size = self.image_size // self.backbone_stride
+
+ # build PromptEncoder and MaskDecoder from SAM
+ # (their hyperparameters like `mask_in_chans=16` are from SAM code)
+ self.sam_prompt_encoder = PromptEncoder(
+ embed_dim=self.sam_prompt_embed_dim,
+ image_embedding_size=(
+ self.sam_image_embedding_size,
+ self.sam_image_embedding_size,
+ ),
+ input_image_size=(self.image_size, self.image_size),
+ mask_in_chans=16,
+ )
+ self.sam_mask_decoder = MaskDecoder(
+ num_multimask_outputs=3,
+ transformer=TwoWayTransformer(
+ depth=2,
+ embedding_dim=self.sam_prompt_embed_dim,
+ mlp_dim=2048,
+ num_heads=8,
+ ),
+ transformer_dim=self.sam_prompt_embed_dim,
+ iou_head_depth=3,
+ iou_head_hidden_dim=256,
+ use_high_res_features=self.use_high_res_features_in_sam,
+ iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
+ pred_obj_scores=self.pred_obj_scores,
+ pred_obj_scores_mlp=self.pred_obj_scores_mlp,
+ use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
+ **(self.sam_mask_decoder_extra_args or {}),
+ )
+ if self.use_obj_ptrs_in_encoder:
+ # a linear projection on SAM output tokens to turn them into object pointers
+ self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
+ if self.use_mlp_for_obj_ptr_proj:
+ self.obj_ptr_proj = MLP(
+ self.hidden_dim, self.hidden_dim, self.hidden_dim, 3
+ )
+ else:
+ self.obj_ptr_proj = torch.nn.Identity()
+ if self.proj_tpos_enc_in_obj_ptrs:
+ # a linear projection on temporal positional encoding in object pointers to
+ # avoid potential interference with spatial positional encoding
+ self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)
+ else:
+ self.obj_ptr_tpos_proj = torch.nn.Identity()
+
+ def _forward_sam_heads(
+ self,
+ backbone_features,
+ point_inputs=None,
+ mask_inputs=None,
+ high_res_features=None,
+ multimask_output=False,
+ ):
+ """
+ Forward SAM prompt encoders and mask heads.
+
+ Inputs:
+ - backbone_features: image features of [B, C, H, W] shape
+ - point_inputs: a dictionary with "point_coords" and "point_labels", where
+ 1) "point_coords" has [B, P, 2] shape and float32 dtype and contains the
+ absolute pixel-unit coordinate in (x, y) format of the P input points
+ 2) "point_labels" has shape [B, P] and int32 dtype, where 1 means
+ positive clicks, 0 means negative clicks, and -1 means padding
+ - mask_inputs: a mask of [B, 1, H*16, W*16] shape, float or bool, with the
+ same spatial size as the image.
+        - high_res_features: either 1) None or 2) a list of length 2 containing
+ two feature maps of [B, C, 4*H, 4*W] and [B, C, 2*H, 2*W] shapes respectively,
+ which will be used as high-resolution feature maps for SAM decoder.
+ - multimask_output: if it's True, we output 3 candidate masks and their 3
+ corresponding IoU estimates, and if it's False, we output only 1 mask and
+ its corresponding IoU estimate.
+
+ Outputs:
+ - low_res_multimasks: [B, M, H*4, W*4] shape (where M = 3 if
+ `multimask_output=True` and M = 1 if `multimask_output=False`), the SAM
+ output mask logits (before sigmoid) for the low-resolution masks, with 4x
+ the resolution (1/4 stride) of the input backbone_features.
+ - high_res_multimasks: [B, M, H*16, W*16] shape (where M = 3
+ if `multimask_output=True` and M = 1 if `multimask_output=False`),
+          upsampled from the low-resolution masks, with the same spatial size as the image
+ (stride is 1 pixel).
+        - ious: [B, M] shape (where M = 3 if `multimask_output=True` and M = 1
+ if `multimask_output=False`), the estimated IoU of each output mask.
+ - low_res_masks: [B, 1, H*4, W*4] shape, the best mask in `low_res_multimasks`.
+ If `multimask_output=True`, it's the mask with the highest IoU estimate.
+ If `multimask_output=False`, it's the same as `low_res_multimasks`.
+ - high_res_masks: [B, 1, H*16, W*16] shape, the best mask in `high_res_multimasks`.
+ If `multimask_output=True`, it's the mask with the highest IoU estimate.
+ If `multimask_output=False`, it's the same as `high_res_multimasks`.
+ - obj_ptr: [B, C] shape, the object pointer vector for the output mask, extracted
+ based on the output token from the SAM mask decoder.
+ """
+ B = backbone_features.size(0)
+ device = backbone_features.device
+ assert backbone_features.size(1) == self.sam_prompt_embed_dim
+ assert backbone_features.size(2) == self.sam_image_embedding_size
+ assert backbone_features.size(3) == self.sam_image_embedding_size
+
+ # a) Handle point prompts
+ if point_inputs is not None:
+ sam_point_coords = point_inputs["point_coords"]
+ sam_point_labels = point_inputs["point_labels"]
+ assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
+ else:
+            # If no points are provided, pad with an empty point (with label -1)
+ sam_point_coords = torch.zeros(B, 1, 2, device=device)
+ sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
+
+ # b) Handle mask prompts
+ if mask_inputs is not None:
+ # If mask_inputs is provided, downsize it into low-res mask input if needed
+ # and feed it as a dense mask prompt into the SAM mask encoder
+ assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
+ if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
+ sam_mask_prompt = F.interpolate(
+ mask_inputs.float(),
+ size=self.sam_prompt_encoder.mask_input_size,
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ else:
+ sam_mask_prompt = mask_inputs
+ else:
+ # Otherwise, simply feed None (and SAM's prompt encoder will add
+ # a learned `no_mask_embed` to indicate no mask input in this case).
+ sam_mask_prompt = None
+
+ sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
+ points=(sam_point_coords, sam_point_labels),
+ boxes=None,
+ masks=sam_mask_prompt,
+ )
+ (
+ low_res_multimasks,
+ ious,
+ sam_output_tokens,
+ object_score_logits,
+ ) = self.sam_mask_decoder(
+ image_embeddings=backbone_features,
+ image_pe=self.sam_prompt_encoder.get_dense_pe(),
+ sparse_prompt_embeddings=sparse_embeddings,
+ dense_prompt_embeddings=dense_embeddings,
+ multimask_output=multimask_output,
+ repeat_image=False, # the image is already batched
+ high_res_features=high_res_features,
+ )
+ if self.pred_obj_scores:
+ is_obj_appearing = object_score_logits > 0
+
+ # Mask used for spatial memories is always a *hard* choice between obj and no obj,
+ # consistent with the actual mask prediction
+ low_res_multimasks = torch.where(
+ is_obj_appearing[:, None, None],
+ low_res_multimasks,
+ NO_OBJ_SCORE,
+ )
+
+ # convert masks from possibly bfloat16 (or float16) to float32
+ # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
+ low_res_multimasks = low_res_multimasks.float()
+ high_res_multimasks = F.interpolate(
+ low_res_multimasks,
+ size=(self.image_size, self.image_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+
+ sam_output_token = sam_output_tokens[:, 0]
+ if multimask_output:
+ # take the best mask prediction (with the highest IoU estimation)
+ best_iou_inds = torch.argmax(ious, dim=-1)
+ batch_inds = torch.arange(B, device=device)
+ low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+ high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+ if sam_output_tokens.size(1) > 1:
+ sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
+ else:
+ low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
+
+ # Extract object pointer from the SAM output token (with occlusion handling)
+ obj_ptr = self.obj_ptr_proj(sam_output_token)
+ if self.pred_obj_scores:
+ # Allow *soft* no obj ptr, unlike for masks
+ if self.soft_no_obj_ptr:
+ lambda_is_obj_appearing = object_score_logits.sigmoid()
+ else:
+ lambda_is_obj_appearing = is_obj_appearing.float()
+
+ if self.fixed_no_obj_ptr:
+ obj_ptr = lambda_is_obj_appearing * obj_ptr
+ obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
+
+ return (
+ low_res_multimasks,
+ high_res_multimasks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ )
+
+ def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
+ """
+        Directly turn binary `mask_inputs` into output mask logits without using SAM.
+ (same input and output shapes as in _forward_sam_heads above).
+ """
+ # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
+ out_scale, out_bias = 20.0, -10.0 # sigmoid(-10.0)=4.5398e-05
+ mask_inputs_float = mask_inputs.float()
+ high_res_masks = mask_inputs_float * out_scale + out_bias
+ low_res_masks = F.interpolate(
+ high_res_masks,
+ size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ # a dummy IoU prediction of all 1's under mask input
+ ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
+ if not self.use_obj_ptrs_in_encoder:
+ # all zeros as a dummy object pointer (of shape [B, C])
+ obj_ptr = torch.zeros(
+ mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device
+ )
+ else:
+ # produce an object pointer using the SAM decoder from the mask input
+ _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
+ backbone_features=backbone_features,
+ mask_inputs=self.mask_downsample(mask_inputs_float),
+ high_res_features=high_res_features,
+ )
+ # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem;
+ # Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying
+ # on the object_scores from the SAM decoder.
+ is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
+ is_obj_appearing = is_obj_appearing[..., None]
+ lambda_is_obj_appearing = is_obj_appearing.float()
+ object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
+ if self.pred_obj_scores:
+ if self.fixed_no_obj_ptr:
+ obj_ptr = lambda_is_obj_appearing * obj_ptr
+ obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
+
+ return (
+ low_res_masks,
+ high_res_masks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ )
+
+ def forward_image(self, img_batch: torch.Tensor):
+ """Get the image feature on the input batch."""
+ backbone_out = self.image_encoder(img_batch)
+ if self.use_high_res_features_in_sam:
+ # precompute projected level 0 and level 1 features in SAM decoder
+ # to avoid running it again on every SAM click
+ backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(
+ backbone_out["backbone_fpn"][0]
+ )
+ backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(
+ backbone_out["backbone_fpn"][1]
+ )
+ return backbone_out
+
+ def _prepare_backbone_features(self, backbone_out):
+ """Prepare and flatten visual features."""
+ backbone_out = backbone_out.copy()
+ assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
+ assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
+
+ feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
+ vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
+
+ feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
+ # flatten NxCxHxW to HWxNxC
+ vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
+ vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
+
+ return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
+
+ def _prepare_memory_conditioned_features(
+ self,
+ frame_idx,
+ is_init_cond_frame,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ output_dict,
+ num_frames,
+ track_in_reverse=False, # tracking in reverse time order (for demo usage)
+ ):
+ """Fuse the current frame's visual feature map with previous memory."""
+ B = current_vision_feats[-1].size(1) # batch size on this frame
+ C = self.hidden_dim
+ H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
+ device = current_vision_feats[-1].device
+ # The case of `self.num_maskmem == 0` below is primarily used for reproducing SAM on images.
+ # In this case, we skip the fusion with any memory.
+ if self.num_maskmem == 0: # Disable memory and skip fusion
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
+ return pix_feat
+
+ num_obj_ptr_tokens = 0
+ tpos_sign_mul = -1 if track_in_reverse else 1
+ # Step 1: condition the visual features of the current frame on previous memories
+ if not is_init_cond_frame:
+ # Retrieve the memories encoded with the maskmem backbone
+ to_cat_memory, to_cat_memory_pos_embed = [], []
+            # Add the conditioning frames' outputs first (all cond frames have t_pos=0
+            # when getting the temporal positional embedding below)
+ assert len(output_dict["cond_frame_outputs"]) > 0
+ # Select a maximum number of temporally closest cond frames for cross attention
+ cond_outputs = output_dict["cond_frame_outputs"]
+ selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames(
+ frame_idx, cond_outputs, self.max_cond_frames_in_attn
+ )
+ t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()]
+ # Add last (self.num_maskmem - 1) frames before current frame for non-conditioning memory
+ # the earliest one has t_pos=1 and the latest one has t_pos=self.num_maskmem-1
+ # We also allow taking the memory frame non-consecutively (with stride>1), in which case
+ # we take (self.num_maskmem - 2) frames among every stride-th frames plus the last frame.
+ stride = 1 if self.training else self.memory_temporal_stride_for_eval
+ for t_pos in range(1, self.num_maskmem):
+ t_rel = self.num_maskmem - t_pos # how many frames before current frame
+ if t_rel == 1:
+ # for t_rel == 1, we take the last frame (regardless of r)
+ if not track_in_reverse:
+ # the frame immediately before this frame (i.e. frame_idx - 1)
+ prev_frame_idx = frame_idx - t_rel
+ else:
+ # the frame immediately after this frame (i.e. frame_idx + 1)
+ prev_frame_idx = frame_idx + t_rel
+ else:
+ # for t_rel >= 2, we take the memory frame from every r-th frames
+ if not track_in_reverse:
+ # first find the nearest frame among every r-th frames before this frame
+ # for r=1, this would be (frame_idx - 2)
+ prev_frame_idx = ((frame_idx - 2) // stride) * stride
+ # then seek further among every r-th frames
+ prev_frame_idx = prev_frame_idx - (t_rel - 2) * stride
+ else:
+ # first find the nearest frame among every r-th frames after this frame
+ # for r=1, this would be (frame_idx + 2)
+ prev_frame_idx = -(-(frame_idx + 2) // stride) * stride
+ # then seek further among every r-th frames
+ prev_frame_idx = prev_frame_idx + (t_rel - 2) * stride
+ out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None)
+ if out is None:
+ # If an unselected conditioning frame is among the last (self.num_maskmem - 1)
+ # frames, we still attend to it as if it's a non-conditioning frame.
+ out = unselected_cond_outputs.get(prev_frame_idx, None)
+ t_pos_and_prevs.append((t_pos, out))
+
+ for t_pos, prev in t_pos_and_prevs:
+ if prev is None:
+ continue # skip padding frames
+ # "maskmem_features" might have been offloaded to CPU in demo use cases,
+ # so we load it back to GPU (it's a no-op if it's already on GPU).
+ feats = prev["maskmem_features"].to(device, non_blocking=True)
+ to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
+ # Spatial positional encoding (it might have been offloaded to CPU in eval)
+ maskmem_enc = prev["maskmem_pos_enc"][-1].to(device)
+ maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1)
+ # Temporal positional encoding
+ maskmem_enc = (
+ maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1]
+ )
+ to_cat_memory_pos_embed.append(maskmem_enc)
+
+ # Construct the list of past object pointers
+ if self.use_obj_ptrs_in_encoder:
+ max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder)
+ # First add those object pointers from selected conditioning frames
+ # (optionally, only include object pointers in the past during evaluation)
+ if not self.training and self.only_obj_ptrs_in_the_past_for_eval:
+ ptr_cond_outputs = {
+ t: out
+ for t, out in selected_cond_outputs.items()
+ if (t >= frame_idx if track_in_reverse else t <= frame_idx)
+ }
+ else:
+ ptr_cond_outputs = selected_cond_outputs
+ pos_and_ptrs = [
+ # Temporal pos encoding contains how far away each pointer is from current frame
+ (
+ (
+ (frame_idx - t) * tpos_sign_mul
+ if self.use_signed_tpos_enc_to_obj_ptrs
+ else abs(frame_idx - t)
+ ),
+ out["obj_ptr"],
+ )
+ for t, out in ptr_cond_outputs.items()
+ ]
+ # Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before current frame
+ for t_diff in range(1, max_obj_ptrs_in_encoder):
+ t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff
+ if t < 0 or (num_frames is not None and t >= num_frames):
+ break
+ out = output_dict["non_cond_frame_outputs"].get(
+ t, unselected_cond_outputs.get(t, None)
+ )
+ if out is not None:
+ pos_and_ptrs.append((t_diff, out["obj_ptr"]))
+                # If we have at least one object pointer, add them to the cross attention
+ if len(pos_and_ptrs) > 0:
+ pos_list, ptrs_list = zip(*pos_and_ptrs)
+ # stack object pointers along dim=0 into [ptr_seq_len, B, C] shape
+ obj_ptrs = torch.stack(ptrs_list, dim=0)
+ # a temporal positional embedding based on how far each object pointer is from
+ # the current frame (sine embedding normalized by the max pointer num).
+ if self.add_tpos_enc_to_obj_ptrs:
+ t_diff_max = max_obj_ptrs_in_encoder - 1
+ tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
+ obj_pos = torch.tensor(pos_list).to(
+ device=device, non_blocking=True
+ )
+ obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
+ obj_pos = self.obj_ptr_tpos_proj(obj_pos)
+ obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
+ else:
+ obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim)
+ if self.mem_dim < C:
+ # split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C
+ obj_ptrs = obj_ptrs.reshape(
+ -1, B, C // self.mem_dim, self.mem_dim
+ )
+ obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1)
+ obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0)
+ to_cat_memory.append(obj_ptrs)
+ to_cat_memory_pos_embed.append(obj_pos)
+ num_obj_ptr_tokens = obj_ptrs.shape[0]
+ else:
+ num_obj_ptr_tokens = 0
+ else:
+ # for initial conditioning frames, encode them without using any previous memory
+ if self.directly_add_no_mem_embed:
+ # directly add no-mem embedding (instead of using the transformer encoder)
+ pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed
+ pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
+ return pix_feat_with_mem
+
+            # Use a dummy token on the first frame (to avoid empty memory input to transformer encoder)
+ to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
+ to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]
+
+ # Step 2: Concatenate the memories and forward through the transformer encoder
+ memory = torch.cat(to_cat_memory, dim=0)
+ memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
+
+ pix_feat_with_mem = self.memory_attention(
+ curr=current_vision_feats,
+ curr_pos=current_vision_pos_embeds,
+ memory=memory,
+ memory_pos=memory_pos_embed,
+ num_obj_ptr_tokens=num_obj_ptr_tokens,
+ )
+ # reshape the output (HW)BC => BCHW
+ pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
+ return pix_feat_with_mem
+
+ def _encode_new_memory(
+ self,
+ current_vision_feats,
+ feat_sizes,
+ pred_masks_high_res,
+ object_score_logits,
+ is_mask_from_pts,
+ ):
+ """Encode the current image and its prediction into a memory feature."""
+ B = current_vision_feats[-1].size(1) # batch size on this frame
+ C = self.hidden_dim
+ H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
+ # top-level feature, (HW)BC => BCHW
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
+ if self.non_overlap_masks_for_mem_enc and not self.training:
+ # optionally, apply non-overlapping constraints to the masks (it's applied
+ # in the batch dimension and should only be used during eval, where all
+ # the objects come from the same video under batch size 1).
+ pred_masks_high_res = self._apply_non_overlapping_constraints(
+ pred_masks_high_res
+ )
+ # scale the raw mask logits with a temperature before applying sigmoid
+ binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
+ if binarize and not self.training:
+ mask_for_mem = (pred_masks_high_res > 0).float()
+ else:
+ # apply sigmoid on the raw mask logits to turn them into range (0, 1)
+ mask_for_mem = torch.sigmoid(pred_masks_high_res)
+ # apply scale and bias terms to the sigmoid probabilities
+ if self.sigmoid_scale_for_mem_enc != 1.0:
+ mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
+ if self.sigmoid_bias_for_mem_enc != 0.0:
+ mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
+ maskmem_out = self.memory_encoder(
+ pix_feat, mask_for_mem, skip_mask_sigmoid=True # sigmoid already applied
+ )
+ maskmem_features = maskmem_out["vision_features"]
+ maskmem_pos_enc = maskmem_out["vision_pos_enc"]
+ # add a no-object embedding to the spatial memory to indicate that the frame
+ # is predicted to be occluded (i.e. no object is appearing in the frame)
+ if self.no_obj_embed_spatial is not None:
+ is_obj_appearing = (object_score_logits > 0).float()
+ maskmem_features += (
+ 1 - is_obj_appearing[..., None, None]
+ ) * self.no_obj_embed_spatial[..., None, None].expand(
+ *maskmem_features.shape
+ )
+
+ return maskmem_features, maskmem_pos_enc
+
+ def _track_step(
+ self,
+ frame_idx,
+ is_init_cond_frame,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ point_inputs,
+ mask_inputs,
+ output_dict,
+ num_frames,
+ track_in_reverse,
+ prev_sam_mask_logits,
+ ):
+ current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
+ # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
+ if len(current_vision_feats) > 1:
+ high_res_features = [
+ x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
+ for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
+ ]
+ else:
+ high_res_features = None
+ if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
+ # When use_mask_input_as_output_without_sam=True, we directly output the mask input
+ # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0)
+ pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
+ sam_outputs = self._use_mask_as_output(
+ pix_feat, high_res_features, mask_inputs
+ )
+ else:
+            # fuse the visual feature with previous memory features in the memory bank
+ pix_feat = self._prepare_memory_conditioned_features(
+ frame_idx=frame_idx,
+ is_init_cond_frame=is_init_cond_frame,
+ current_vision_feats=current_vision_feats[-1:],
+ current_vision_pos_embeds=current_vision_pos_embeds[-1:],
+ feat_sizes=feat_sizes[-1:],
+ output_dict=output_dict,
+ num_frames=num_frames,
+ track_in_reverse=track_in_reverse,
+ )
+ # apply SAM-style segmentation head
+ # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder,
+ # e.g. in demo where such logits come from earlier interaction instead of correction sampling
+ # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead)
+ if prev_sam_mask_logits is not None:
+ assert point_inputs is not None and mask_inputs is None
+ mask_inputs = prev_sam_mask_logits
+ multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
+ sam_outputs = self._forward_sam_heads(
+ backbone_features=pix_feat,
+ point_inputs=point_inputs,
+ mask_inputs=mask_inputs,
+ high_res_features=high_res_features,
+ multimask_output=multimask_output,
+ )
+
+ return current_out, sam_outputs, high_res_features, pix_feat
+
+ def _encode_memory_in_output(
+ self,
+ current_vision_feats,
+ feat_sizes,
+ point_inputs,
+ run_mem_encoder,
+ high_res_masks,
+ object_score_logits,
+ current_out,
+ ):
+ if run_mem_encoder and self.num_maskmem > 0:
+ high_res_masks_for_mem_enc = high_res_masks
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+ current_vision_feats=current_vision_feats,
+ feat_sizes=feat_sizes,
+ pred_masks_high_res=high_res_masks_for_mem_enc,
+ object_score_logits=object_score_logits,
+ is_mask_from_pts=(point_inputs is not None),
+ )
+ current_out["maskmem_features"] = maskmem_features
+ current_out["maskmem_pos_enc"] = maskmem_pos_enc
+ else:
+ current_out["maskmem_features"] = None
+ current_out["maskmem_pos_enc"] = None
+
+ def track_step(
+ self,
+ frame_idx,
+ is_init_cond_frame,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ point_inputs,
+ mask_inputs,
+ output_dict,
+ num_frames,
+ track_in_reverse=False, # tracking in reverse time order (for demo usage)
+ # Whether to run the memory encoder on the predicted masks. Sometimes we might want
+ # to skip the memory encoder with `run_mem_encoder=False`. For example,
+ # in demo we might call `track_step` multiple times for each user click,
+ # and only encode the memory when the user finalizes their clicks. And in ablation
+ # settings like SAM training on static images, we don't need the memory encoder.
+ run_mem_encoder=True,
+ # The previously predicted SAM mask logits (which can be fed together with new clicks in demo).
+ prev_sam_mask_logits=None,
+ ):
+ current_out, sam_outputs, _, _ = self._track_step(
+ frame_idx,
+ is_init_cond_frame,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ point_inputs,
+ mask_inputs,
+ output_dict,
+ num_frames,
+ track_in_reverse,
+ prev_sam_mask_logits,
+ )
+
+ (
+ _,
+ _,
+ _,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ ) = sam_outputs
+
+ current_out["pred_masks"] = low_res_masks
+ current_out["pred_masks_high_res"] = high_res_masks
+ current_out["obj_ptr"] = obj_ptr
+ if not self.training:
+ # Only add this in inference (to avoid unused param in activation checkpointing;
+ # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
+ current_out["object_score_logits"] = object_score_logits
+
+ # Finally run the memory encoder on the predicted mask to encode
+ # it into a new memory feature (that can be used in future frames)
+ self._encode_memory_in_output(
+ current_vision_feats,
+ feat_sizes,
+ point_inputs,
+ run_mem_encoder,
+ high_res_masks,
+ object_score_logits,
+ current_out,
+ )
+
+ return current_out
+
+ def _use_multimask(self, is_init_cond_frame, point_inputs):
+ """Whether to use multimask output in the SAM head."""
+ num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
+ multimask_output = (
+ self.multimask_output_in_sam
+ and (is_init_cond_frame or self.multimask_output_for_tracking)
+ and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
+ )
+ return multimask_output
+
+ def _apply_non_overlapping_constraints(self, pred_masks):
+ """
+ Apply non-overlapping constraints to the object scores in pred_masks. Here we
+ keep only the highest scoring object at each spatial location in pred_masks.
+ """
+ batch_size = pred_masks.size(0)
+ if batch_size == 1:
+ return pred_masks
+
+ device = pred_masks.device
+ # "max_obj_inds": object index of the object with the highest score at each location
+ max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
+ # "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks`
+ batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None]
+ keep = max_obj_inds == batch_obj_inds
+ # suppress overlapping regions' scores below -10.0 so that the foreground regions
+ # don't overlap (here sigmoid(-10.0)=4.5398e-05)
+ pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
+ return pred_masks
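+
+
+if __name__ == "__main__":
+    # Illustrative sketch only (not part of upstream SAM2): _apply_non_overlapping_constraints
+    # keeps, at each spatial location, only the object with the highest logit and pushes all
+    # other objects' logits to -10 or below. The method does not read `self`, so it can be
+    # exercised here on toy logits without building a full SAM2 model.
+    toy_logits = torch.randn(3, 1, 4, 4)  # 3 objects, one 4x4 "image"
+    suppressed = SAM2Base._apply_non_overlapping_constraints(None, toy_logits)
+    objects_above_threshold = (suppressed > -10.0).sum(dim=0)
+    print(objects_above_threshold.max().item())  # at most 1 object survives per pixel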
diff --git a/phantom/submodules/sam2/sam2/modeling/sam2_utils.py b/phantom/submodules/sam2/sam2/modeling/sam2_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e16caae3a9a49e451b2d03d1ee60c47f8e9ed23c
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/modeling/sam2_utils.py
@@ -0,0 +1,323 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import copy
+from typing import Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from sam2.utils.misc import mask_to_box
+
+
+def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num):
+ """
+ Select up to `max_cond_frame_num` conditioning frames from `cond_frame_outputs`
+ that are temporally closest to the current frame at `frame_idx`. Here, we take
+ - a) the closest conditioning frame before `frame_idx` (if any);
+ - b) the closest conditioning frame after `frame_idx` (if any);
+ - c) any other temporally closest conditioning frames until reaching a total
+ of `max_cond_frame_num` conditioning frames.
+
+ Outputs:
+ - selected_outputs: selected items (keys & values) from `cond_frame_outputs`.
+ - unselected_outputs: items (keys & values) not selected in `cond_frame_outputs`.
+ """
+ if max_cond_frame_num == -1 or len(cond_frame_outputs) <= max_cond_frame_num:
+ selected_outputs = cond_frame_outputs
+ unselected_outputs = {}
+ else:
+ assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames"
+ selected_outputs = {}
+
+ # the closest conditioning frame before `frame_idx` (if any)
+ idx_before = max((t for t in cond_frame_outputs if t < frame_idx), default=None)
+ if idx_before is not None:
+ selected_outputs[idx_before] = cond_frame_outputs[idx_before]
+
+ # the closest conditioning frame after `frame_idx` (if any)
+ idx_after = min((t for t in cond_frame_outputs if t >= frame_idx), default=None)
+ if idx_after is not None:
+ selected_outputs[idx_after] = cond_frame_outputs[idx_after]
+
+ # add other temporally closest conditioning frames until reaching a total
+ # of `max_cond_frame_num` conditioning frames.
+ num_remain = max_cond_frame_num - len(selected_outputs)
+ inds_remain = sorted(
+ (t for t in cond_frame_outputs if t not in selected_outputs),
+ key=lambda x: abs(x - frame_idx),
+ )[:num_remain]
+ selected_outputs.update((t, cond_frame_outputs[t]) for t in inds_remain)
+ unselected_outputs = {
+ t: v for t, v in cond_frame_outputs.items() if t not in selected_outputs
+ }
+
+ return selected_outputs, unselected_outputs
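+
+
+if __name__ == "__main__":
+    # Illustrative sketch only (not part of upstream SAM2): with conditioning frames at
+    # t = 0, 10, 20, 30 and a budget of 2, tracking frame 12 keeps the closest frame
+    # before it (10) and the closest frame at/after it (20).
+    cond = {0: "out0", 10: "out10", 20: "out20", 30: "out30"}
+    selected, unselected = select_closest_cond_frames(12, cond, max_cond_frame_num=2)
+    print(sorted(selected), sorted(unselected))  # [10, 20] [0, 30]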
+
+
+def get_1d_sine_pe(pos_inds, dim, temperature=10000):
+ """
+ Get 1D sine positional embedding as in the original Transformer paper.
+ """
+ pe_dim = dim // 2
+ dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device)
+ dim_t = temperature ** (2 * (dim_t // 2) / pe_dim)
+
+ pos_embed = pos_inds.unsqueeze(-1) / dim_t
+ pos_embed = torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1)
+ return pos_embed
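+
+
+if __name__ == "__main__":
+    # Illustrative sketch only (not part of upstream SAM2): embed a batch of 4 temporal
+    # positions into 64-dim sine/cosine vectors (first half sin, second half cos).
+    pos = torch.tensor([0.0, 1.0, 2.0, 3.0])
+    pe = get_1d_sine_pe(pos, dim=64)
+    print(pe.shape)  # torch.Size([4, 64])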
+
+
+def get_activation_fn(activation):
+ """Return an activation function given a string"""
+ if activation == "relu":
+ return F.relu
+ if activation == "gelu":
+ return F.gelu
+ if activation == "glu":
+ return F.glu
+ raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
+
+
+def get_clones(module, N):
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+class DropPath(nn.Module):
+ # adapted from https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+ def __init__(self, drop_prob=0.0, scale_by_keep=True):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+ self.scale_by_keep = scale_by_keep
+
+ def forward(self, x):
+ if self.drop_prob == 0.0 or not self.training:
+ return x
+ keep_prob = 1 - self.drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+ if keep_prob > 0.0 and self.scale_by_keep:
+ random_tensor.div_(keep_prob)
+ return x * random_tensor
+
+
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
+class MLP(nn.Module):
+ def __init__(
+ self,
+ input_dim: int,
+ hidden_dim: int,
+ output_dim: int,
+ num_layers: int,
+ activation: nn.Module = nn.ReLU,
+ sigmoid_output: bool = False,
+ ) -> None:
+ super().__init__()
+ self.num_layers = num_layers
+ h = [hidden_dim] * (num_layers - 1)
+ self.layers = nn.ModuleList(
+ nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
+ )
+ self.sigmoid_output = sigmoid_output
+ self.act = activation()
+
+ def forward(self, x):
+ for i, layer in enumerate(self.layers):
+ x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x)
+ if self.sigmoid_output:
+ x = F.sigmoid(x)
+ return x
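+
+
+if __name__ == "__main__":
+    # Illustrative sketch only (not part of upstream SAM2): a 3-layer MLP with matching
+    # input/hidden/output widths, the shape SAM2Base uses for the optional object-pointer
+    # projection when use_mlp_for_obj_ptr_proj=True (256 is an assumed hidden_dim).
+    mlp = MLP(input_dim=256, hidden_dim=256, output_dim=256, num_layers=3)
+    x = torch.randn(4, 256)
+    print(mlp(x).shape)  # torch.Size([4, 256])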
+
+
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
+class LayerNorm2d(nn.Module):
+ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(num_channels))
+ self.bias = nn.Parameter(torch.zeros(num_channels))
+ self.eps = eps
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ u = x.mean(1, keepdim=True)
+ s = (x - u).pow(2).mean(1, keepdim=True)
+ x = (x - u) / torch.sqrt(s + self.eps)
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
+ return x
+
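+# Illustrative note (not part of the upstream SAM 2 file): LayerNorm2d normalizes an
+# NCHW tensor over the channel dimension at every spatial location (mean/variance
+# over dim=1), then applies a learned per-channel scale and shift.
+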
+
+def sample_box_points(
+ masks: torch.Tensor,
+ noise: float = 0.1, # SAM default
+ noise_bound: int = 20, # SAM default
+ top_left_label: int = 2,
+ bottom_right_label: int = 3,
+) -> Tuple[np.ndarray, np.ndarray]:
+ """
+    Sample a noised version of the top left and bottom right corners of the bounding box derived from each mask
+
+    Inputs:
+    - masks: [B, 1, H, W] binary masks, dtype=torch.Tensor
+    - noise: noise as a fraction of box width and height, dtype=float
+    - noise_bound: maximum amount of noise (in pixels), dtype=int
+
+    Returns:
+    - box_coords: [B, num_pt, 2], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.float
+    - box_labels: [B, num_pt], label 2 is reserved for top left and 3 for bottom right corners, dtype=torch.int32
+ """
+ device = masks.device
+ box_coords = mask_to_box(masks)
+ B, _, H, W = masks.shape
+ box_labels = torch.tensor(
+ [top_left_label, bottom_right_label], dtype=torch.int, device=device
+ ).repeat(B)
+ if noise > 0.0:
+ if not isinstance(noise_bound, torch.Tensor):
+ noise_bound = torch.tensor(noise_bound, device=device)
+ bbox_w = box_coords[..., 2] - box_coords[..., 0]
+ bbox_h = box_coords[..., 3] - box_coords[..., 1]
+ max_dx = torch.min(bbox_w * noise, noise_bound)
+ max_dy = torch.min(bbox_h * noise, noise_bound)
+ box_noise = 2 * torch.rand(B, 1, 4, device=device) - 1
+ box_noise = box_noise * torch.stack((max_dx, max_dy, max_dx, max_dy), dim=-1)
+
+ box_coords = box_coords + box_noise
+ img_bounds = (
+ torch.tensor([W, H, W, H], device=device) - 1
+ ) # uncentered pixel coords
+ box_coords.clamp_(torch.zeros_like(img_bounds), img_bounds) # In place clamping
+
+ box_coords = box_coords.reshape(-1, 2, 2) # always 2 points
+ box_labels = box_labels.reshape(-1, 2)
+ return box_coords, box_labels
+
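+# Illustrative note (not part of the upstream SAM 2 file): for masks of shape
+# (B, 1, H, W), this returns box_coords of shape (B, 2, 2) holding the jittered
+# top-left and bottom-right (x, y) corners, and box_labels of shape (B, 2) filled
+# with the labels 2 and 3.
+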
+
+def sample_random_points_from_errors(gt_masks, pred_masks, num_pt=1):
+ """
+ Sample `num_pt` random points (along with their labels) independently from the error regions.
+
+ Inputs:
+ - gt_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool
+ - pred_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool or None
+ - num_pt: int, number of points to sample independently for each of the B error maps
+
+ Outputs:
+ - points: [B, num_pt, 2], dtype=torch.float, contains (x, y) coordinates of each sampled point
+ - labels: [B, num_pt], dtype=torch.int32, where 1 means positive clicks and 0 means
+ negative clicks
+ """
+ if pred_masks is None: # if pred_masks is not provided, treat it as empty
+ pred_masks = torch.zeros_like(gt_masks)
+ assert gt_masks.dtype == torch.bool and gt_masks.size(1) == 1
+ assert pred_masks.dtype == torch.bool and pred_masks.shape == gt_masks.shape
+ assert num_pt >= 0
+
+ B, _, H_im, W_im = gt_masks.shape
+ device = gt_masks.device
+
+ # false positive region, a new point sampled in this region should have
+ # negative label to correct the FP error
+ fp_masks = ~gt_masks & pred_masks
+ # false negative region, a new point sampled in this region should have
+ # positive label to correct the FN error
+ fn_masks = gt_masks & ~pred_masks
+    # whether the prediction completely matches the ground-truth on each mask
+ all_correct = torch.all((gt_masks == pred_masks).flatten(2), dim=2)
+ all_correct = all_correct[..., None, None]
+
+ # channel 0 is FP map, while channel 1 is FN map
+ pts_noise = torch.rand(B, num_pt, H_im, W_im, 2, device=device)
+ # sample a negative new click from FP region or a positive new click
+    # from FN region, depending on where the maximum falls,
+ # and in case the predictions are all correct (no FP or FN), we just
+ # sample a negative click from the background region
+ pts_noise[..., 0] *= fp_masks | (all_correct & ~gt_masks)
+ pts_noise[..., 1] *= fn_masks
+ pts_idx = pts_noise.flatten(2).argmax(dim=2)
+ labels = (pts_idx % 2).to(torch.int32)
+ pts_idx = pts_idx // 2
+ pts_x = pts_idx % W_im
+ pts_y = pts_idx // W_im
+ points = torch.stack([pts_x, pts_y], dim=2).to(torch.float)
+ return points, labels
+
+
+def sample_one_point_from_error_center(gt_masks, pred_masks, padding=True):
+ """
+ Sample 1 random point (along with its label) from the center of each error region,
+ that is, the point with the largest distance to the boundary of each error region.
+ This is the RITM sampling method from https://github.com/saic-vul/ritm_interactive_segmentation/blob/master/isegm/inference/clicker.py
+
+ Inputs:
+ - gt_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool
+ - pred_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool or None
+ - padding: if True, pad with boundary of 1 px for distance transform
+
+ Outputs:
+ - points: [B, 1, 2], dtype=torch.float, contains (x, y) coordinates of each sampled point
+ - labels: [B, 1], dtype=torch.int32, where 1 means positive clicks and 0 means negative clicks
+ """
+ import cv2
+
+ if pred_masks is None:
+ pred_masks = torch.zeros_like(gt_masks)
+ assert gt_masks.dtype == torch.bool and gt_masks.size(1) == 1
+ assert pred_masks.dtype == torch.bool and pred_masks.shape == gt_masks.shape
+
+ B, _, _, W_im = gt_masks.shape
+ device = gt_masks.device
+
+ # false positive region, a new point sampled in this region should have
+ # negative label to correct the FP error
+ fp_masks = ~gt_masks & pred_masks
+ # false negative region, a new point sampled in this region should have
+ # positive label to correct the FN error
+ fn_masks = gt_masks & ~pred_masks
+
+ fp_masks = fp_masks.cpu().numpy()
+ fn_masks = fn_masks.cpu().numpy()
+ points = torch.zeros(B, 1, 2, dtype=torch.float)
+ labels = torch.ones(B, 1, dtype=torch.int32)
+ for b in range(B):
+ fn_mask = fn_masks[b, 0]
+ fp_mask = fp_masks[b, 0]
+ if padding:
+ fn_mask = np.pad(fn_mask, ((1, 1), (1, 1)), "constant")
+ fp_mask = np.pad(fp_mask, ((1, 1), (1, 1)), "constant")
+ # compute the distance of each point in FN/FP region to its boundary
+ fn_mask_dt = cv2.distanceTransform(fn_mask.astype(np.uint8), cv2.DIST_L2, 0)
+ fp_mask_dt = cv2.distanceTransform(fp_mask.astype(np.uint8), cv2.DIST_L2, 0)
+ if padding:
+ fn_mask_dt = fn_mask_dt[1:-1, 1:-1]
+ fp_mask_dt = fp_mask_dt[1:-1, 1:-1]
+
+ # take the point in FN/FP region with the largest distance to its boundary
+ fn_mask_dt_flat = fn_mask_dt.reshape(-1)
+ fp_mask_dt_flat = fp_mask_dt.reshape(-1)
+ fn_argmax = np.argmax(fn_mask_dt_flat)
+ fp_argmax = np.argmax(fp_mask_dt_flat)
+ is_positive = fn_mask_dt_flat[fn_argmax] > fp_mask_dt_flat[fp_argmax]
+ pt_idx = fn_argmax if is_positive else fp_argmax
+ points[b, 0, 0] = pt_idx % W_im # x
+ points[b, 0, 1] = pt_idx // W_im # y
+ labels[b, 0] = int(is_positive)
+
+ points = points.to(device)
+ labels = labels.to(device)
+ return points, labels
+
+
+def get_next_point(gt_masks, pred_masks, method):
+ if method == "uniform":
+ return sample_random_points_from_errors(gt_masks, pred_masks)
+ elif method == "center":
+ return sample_one_point_from_error_center(gt_masks, pred_masks)
+ else:
+ raise ValueError(f"unknown sampling method {method}")
diff --git a/phantom/submodules/sam2/sam2/sam2_hiera_b+.yaml b/phantom/submodules/sam2/sam2/sam2_hiera_b+.yaml
new file mode 120000
index 0000000000000000000000000000000000000000..998d9c98c9ff4e8ddd55deff72aa0d9067977418
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/sam2_hiera_b+.yaml
@@ -0,0 +1 @@
+configs/sam2/sam2_hiera_b+.yaml
\ No newline at end of file
diff --git a/phantom/submodules/sam2/sam2/sam2_hiera_l.yaml b/phantom/submodules/sam2/sam2/sam2_hiera_l.yaml
new file mode 120000
index 0000000000000000000000000000000000000000..c0e7e58e1951d5c55a3a3ebe6b803dd814cf9d86
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/sam2_hiera_l.yaml
@@ -0,0 +1 @@
+configs/sam2/sam2_hiera_l.yaml
\ No newline at end of file
diff --git a/phantom/submodules/sam2/sam2/sam2_hiera_s.yaml b/phantom/submodules/sam2/sam2/sam2_hiera_s.yaml
new file mode 120000
index 0000000000000000000000000000000000000000..41896a26beb2aa831d18b0bf3c349ed43deeef68
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/sam2_hiera_s.yaml
@@ -0,0 +1 @@
+configs/sam2/sam2_hiera_s.yaml
\ No newline at end of file
diff --git a/phantom/submodules/sam2/sam2/sam2_hiera_t.yaml b/phantom/submodules/sam2/sam2/sam2_hiera_t.yaml
new file mode 120000
index 0000000000000000000000000000000000000000..71ff3abbb1e11f8b82100a0a1d63cb267eefe52a
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/sam2_hiera_t.yaml
@@ -0,0 +1 @@
+configs/sam2/sam2_hiera_t.yaml
\ No newline at end of file
diff --git a/phantom/submodules/sam2/sam2/sam2_image_predictor.py b/phantom/submodules/sam2/sam2/sam2_image_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..41ce53af5924504c07216df52b2d2eefaeec7ae9
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/sam2_image_predictor.py
@@ -0,0 +1,466 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL.Image import Image
+
+from sam2.modeling.sam2_base import SAM2Base
+
+from sam2.utils.transforms import SAM2Transforms
+
+
+class SAM2ImagePredictor:
+ def __init__(
+ self,
+ sam_model: SAM2Base,
+ mask_threshold=0.0,
+ max_hole_area=0.0,
+ max_sprinkle_area=0.0,
+ **kwargs,
+ ) -> None:
+ """
+ Uses SAM-2 to calculate the image embedding for an image, and then
+        allows repeated, efficient mask prediction given prompts.
+
+ Arguments:
+ sam_model (Sam-2): The model to use for mask prediction.
+ mask_threshold (float): The threshold to use when converting mask logits
+ to binary masks. Masks are thresholded at 0 by default.
+            max_hole_area (int): If max_hole_area > 0, we fill small holes of area up to
+              max_hole_area in low_res_masks.
+            max_sprinkle_area (int): If max_sprinkle_area > 0, we remove small sprinkles of area up to
+              max_sprinkle_area in low_res_masks.
+ """
+ super().__init__()
+ self.model = sam_model
+ self._transforms = SAM2Transforms(
+ resolution=self.model.image_size,
+ mask_threshold=mask_threshold,
+ max_hole_area=max_hole_area,
+ max_sprinkle_area=max_sprinkle_area,
+ )
+
+ # Predictor state
+ self._is_image_set = False
+ self._features = None
+ self._orig_hw = None
+ # Whether the predictor is set for single image or a batch of images
+ self._is_batch = False
+
+ # Predictor config
+ self.mask_threshold = mask_threshold
+
+ # Spatial dim for backbone feature maps
+ self._bb_feat_sizes = [
+ (256, 256),
+ (128, 128),
+ (64, 64),
+ ]
+
+ @classmethod
+ def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2ImagePredictor":
+ """
+ Load a pretrained model from the Hugging Face hub.
+
+ Arguments:
+ model_id (str): The Hugging Face repository ID.
+ **kwargs: Additional arguments to pass to the model constructor.
+
+ Returns:
+ (SAM2ImagePredictor): The loaded model.
+ """
+ from sam2.build_sam import build_sam2_hf
+
+ sam_model = build_sam2_hf(model_id, **kwargs)
+ return cls(sam_model, **kwargs)
+
+ @torch.no_grad()
+ def set_image(
+ self,
+ image: Union[np.ndarray, Image],
+ ) -> None:
+ """
+ Calculates the image embeddings for the provided image, allowing
+ masks to be predicted with the 'predict' method.
+
+ Arguments:
+          image (np.ndarray or PIL Image): The input image to embed in RGB format, with
+            pixel values in [0, 255]. np.ndarray inputs are expected in HWC format.
+ """
+ self.reset_predictor()
+ # Transform the image to the form expected by the model
+ if isinstance(image, np.ndarray):
+ logging.info("For numpy array image, we assume (HxWxC) format")
+ self._orig_hw = [image.shape[:2]]
+ elif isinstance(image, Image):
+ w, h = image.size
+ self._orig_hw = [(h, w)]
+ else:
+ raise NotImplementedError("Image format not supported")
+
+ input_image = self._transforms(image)
+ input_image = input_image[None, ...].to(self.device)
+
+ assert (
+ len(input_image.shape) == 4 and input_image.shape[1] == 3
+ ), f"input_image must be of size 1x3xHxW, got {input_image.shape}"
+ logging.info("Computing image embeddings for the provided image...")
+ backbone_out = self.model.forward_image(input_image)
+ _, vision_feats, _, _ = self.model._prepare_backbone_features(backbone_out)
+        # Add no_mem_embed, which is added to the lowest-resolution feature map during training on videos
+ if self.model.directly_add_no_mem_embed:
+ vision_feats[-1] = vision_feats[-1] + self.model.no_mem_embed
+
+ feats = [
+ feat.permute(1, 2, 0).view(1, -1, *feat_size)
+ for feat, feat_size in zip(vision_feats[::-1], self._bb_feat_sizes[::-1])
+ ][::-1]
+ self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
+ self._is_image_set = True
+ logging.info("Image embeddings computed.")
+
+ @torch.no_grad()
+ def set_image_batch(
+ self,
+        image_list: List[np.ndarray],
+ ) -> None:
+ """
+ Calculates the image embeddings for the provided image batch, allowing
+ masks to be predicted with the 'predict_batch' method.
+
+ Arguments:
+ image_list (List[np.ndarray]): The input images to embed in RGB format. The image should be in HWC format if np.ndarray
+ with pixel values in [0, 255].
+ """
+ self.reset_predictor()
+ assert isinstance(image_list, list)
+ self._orig_hw = []
+ for image in image_list:
+ assert isinstance(
+ image, np.ndarray
+ ), "Images are expected to be an np.ndarray in RGB format, and of shape HWC"
+ self._orig_hw.append(image.shape[:2])
+ # Transform the image to the form expected by the model
+ img_batch = self._transforms.forward_batch(image_list)
+ img_batch = img_batch.to(self.device)
+ batch_size = img_batch.shape[0]
+ assert (
+ len(img_batch.shape) == 4 and img_batch.shape[1] == 3
+ ), f"img_batch must be of size Bx3xHxW, got {img_batch.shape}"
+ logging.info("Computing image embeddings for the provided images...")
+ backbone_out = self.model.forward_image(img_batch)
+ _, vision_feats, _, _ = self.model._prepare_backbone_features(backbone_out)
+        # Add no_mem_embed, which is added to the lowest-resolution feature map during training on videos
+ if self.model.directly_add_no_mem_embed:
+ vision_feats[-1] = vision_feats[-1] + self.model.no_mem_embed
+
+ feats = [
+ feat.permute(1, 2, 0).view(batch_size, -1, *feat_size)
+ for feat, feat_size in zip(vision_feats[::-1], self._bb_feat_sizes[::-1])
+ ][::-1]
+ self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
+ self._is_image_set = True
+ self._is_batch = True
+ logging.info("Image embeddings computed.")
+
+ def predict_batch(
+ self,
+ point_coords_batch: List[np.ndarray] = None,
+ point_labels_batch: List[np.ndarray] = None,
+ box_batch: List[np.ndarray] = None,
+ mask_input_batch: List[np.ndarray] = None,
+ multimask_output: bool = True,
+ return_logits: bool = False,
+ normalize_coords=True,
+ ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
+ """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images.
+ It returns a tuple of lists of masks, ious, and low_res_masks_logits.
+ """
+ assert self._is_batch, "This function should only be used when in batched mode"
+ if not self._is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image_batch(...) before mask prediction."
+ )
+ num_images = len(self._features["image_embed"])
+ all_masks = []
+ all_ious = []
+ all_low_res_masks = []
+ for img_idx in range(num_images):
+ # Transform input prompts
+ point_coords = (
+ point_coords_batch[img_idx] if point_coords_batch is not None else None
+ )
+ point_labels = (
+ point_labels_batch[img_idx] if point_labels_batch is not None else None
+ )
+ box = box_batch[img_idx] if box_batch is not None else None
+ mask_input = (
+ mask_input_batch[img_idx] if mask_input_batch is not None else None
+ )
+ mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts(
+ point_coords,
+ point_labels,
+ box,
+ mask_input,
+ normalize_coords,
+ img_idx=img_idx,
+ )
+ masks, iou_predictions, low_res_masks = self._predict(
+ unnorm_coords,
+ labels,
+ unnorm_box,
+ mask_input,
+ multimask_output,
+ return_logits=return_logits,
+ img_idx=img_idx,
+ )
+ masks_np = masks.squeeze(0).float().detach().cpu().numpy()
+ iou_predictions_np = (
+ iou_predictions.squeeze(0).float().detach().cpu().numpy()
+ )
+ low_res_masks_np = low_res_masks.squeeze(0).float().detach().cpu().numpy()
+ all_masks.append(masks_np)
+ all_ious.append(iou_predictions_np)
+ all_low_res_masks.append(low_res_masks_np)
+
+ return all_masks, all_ious, all_low_res_masks
+
+ def predict(
+ self,
+ point_coords: Optional[np.ndarray] = None,
+ point_labels: Optional[np.ndarray] = None,
+ box: Optional[np.ndarray] = None,
+ mask_input: Optional[np.ndarray] = None,
+ multimask_output: bool = True,
+ return_logits: bool = False,
+ normalize_coords=True,
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """
+ Predict masks for the given input prompts, using the currently set image.
+
+ Arguments:
+ point_coords (np.ndarray or None): A Nx2 array of point prompts to the
+ model. Each point is in (X,Y) in pixels.
+ point_labels (np.ndarray or None): A length N array of labels for the
+ point prompts. 1 indicates a foreground point and 0 indicates a
+ background point.
+ box (np.ndarray or None): A length 4 array given a box prompt to the
+ model, in XYXY format.
+ mask_input (np.ndarray): A low resolution mask input to the model, typically
+ coming from a previous prediction iteration. Has form 1xHxW, where
+ for SAM, H=W=256.
+ multimask_output (bool): If true, the model will return three masks.
+ For ambiguous input prompts (such as a single click), this will often
+ produce better masks than a single prediction. If only a single
+ mask is needed, the model's predicted quality score can be used
+ to select the best mask. For non-ambiguous prompts, such as multiple
+ input prompts, multimask_output=False can give better results.
+ return_logits (bool): If true, returns un-thresholded masks logits
+ instead of a binary mask.
+ normalize_coords (bool): If true, the point coordinates will be normalized to the range [0,1] and point_coords is expected to be wrt. image dimensions.
+
+ Returns:
+ (np.ndarray): The output masks in CxHxW format, where C is the
+ number of masks, and (H, W) is the original image size.
+ (np.ndarray): An array of length C containing the model's
+ predictions for the quality of each mask.
+ (np.ndarray): An array of shape CxHxW, where C is the number
+ of masks and H=W=256. These low resolution logits can be passed to
+ a subsequent iteration as mask input.
+ """
+ if not self._is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image(...) before mask prediction."
+ )
+
+ # Transform input prompts
+
+ mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts(
+ point_coords, point_labels, box, mask_input, normalize_coords
+ )
+
+ masks, iou_predictions, low_res_masks = self._predict(
+ unnorm_coords,
+ labels,
+ unnorm_box,
+ mask_input,
+ multimask_output,
+ return_logits=return_logits,
+ )
+
+ masks_np = masks.squeeze(0).float().detach().cpu().numpy()
+ iou_predictions_np = iou_predictions.squeeze(0).float().detach().cpu().numpy()
+ low_res_masks_np = low_res_masks.squeeze(0).float().detach().cpu().numpy()
+ return masks_np, iou_predictions_np, low_res_masks_np
+
+ def _prep_prompts(
+ self, point_coords, point_labels, box, mask_logits, normalize_coords, img_idx=-1
+ ):
+
+ unnorm_coords, labels, unnorm_box, mask_input = None, None, None, None
+ if point_coords is not None:
+ assert (
+ point_labels is not None
+ ), "point_labels must be supplied if point_coords is supplied."
+ point_coords = torch.as_tensor(
+ point_coords, dtype=torch.float, device=self.device
+ )
+ unnorm_coords = self._transforms.transform_coords(
+ point_coords, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx]
+ )
+ labels = torch.as_tensor(point_labels, dtype=torch.int, device=self.device)
+ if len(unnorm_coords.shape) == 2:
+ unnorm_coords, labels = unnorm_coords[None, ...], labels[None, ...]
+ if box is not None:
+ box = torch.as_tensor(box, dtype=torch.float, device=self.device)
+ unnorm_box = self._transforms.transform_boxes(
+ box, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx]
+ ) # Bx2x2
+ if mask_logits is not None:
+ mask_input = torch.as_tensor(
+ mask_logits, dtype=torch.float, device=self.device
+ )
+ if len(mask_input.shape) == 3:
+ mask_input = mask_input[None, :, :, :]
+ return mask_input, unnorm_coords, labels, unnorm_box
+
+ @torch.no_grad()
+ def _predict(
+ self,
+ point_coords: Optional[torch.Tensor],
+ point_labels: Optional[torch.Tensor],
+ boxes: Optional[torch.Tensor] = None,
+ mask_input: Optional[torch.Tensor] = None,
+ multimask_output: bool = True,
+ return_logits: bool = False,
+ img_idx: int = -1,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Predict masks for the given input prompts, using the currently set image.
+ Input prompts are batched torch tensors and are expected to already be
+ transformed to the input frame using SAM2Transforms.
+
+ Arguments:
+ point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
+ model. Each point is in (X,Y) in pixels.
+ point_labels (torch.Tensor or None): A BxN array of labels for the
+ point prompts. 1 indicates a foreground point and 0 indicates a
+ background point.
+ boxes (np.ndarray or None): A Bx4 array given a box prompt to the
+ model, in XYXY format.
+ mask_input (np.ndarray): A low resolution mask input to the model, typically
+ coming from a previous prediction iteration. Has form Bx1xHxW, where
+ for SAM, H=W=256. Masks returned by a previous iteration of the
+ predict method do not need further transformation.
+ multimask_output (bool): If true, the model will return three masks.
+ For ambiguous input prompts (such as a single click), this will often
+ produce better masks than a single prediction. If only a single
+ mask is needed, the model's predicted quality score can be used
+ to select the best mask. For non-ambiguous prompts, such as multiple
+ input prompts, multimask_output=False can give better results.
+ return_logits (bool): If true, returns un-thresholded masks logits
+ instead of a binary mask.
+
+ Returns:
+ (torch.Tensor): The output masks in BxCxHxW format, where C is the
+ number of masks, and (H, W) is the original image size.
+ (torch.Tensor): An array of shape BxC containing the model's
+ predictions for the quality of each mask.
+ (torch.Tensor): An array of shape BxCxHxW, where C is the number
+ of masks and H=W=256. These low res logits can be passed to
+ a subsequent iteration as mask input.
+ """
+ if not self._is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image(...) before mask prediction."
+ )
+
+ if point_coords is not None:
+ concat_points = (point_coords, point_labels)
+ else:
+ concat_points = None
+
+ # Embed prompts
+ if boxes is not None:
+ box_coords = boxes.reshape(-1, 2, 2)
+ box_labels = torch.tensor([[2, 3]], dtype=torch.int, device=boxes.device)
+ box_labels = box_labels.repeat(boxes.size(0), 1)
+ # we merge "boxes" and "points" into a single "concat_points" input (where
+ # boxes are added at the beginning) to sam_prompt_encoder
+ if concat_points is not None:
+ concat_coords = torch.cat([box_coords, concat_points[0]], dim=1)
+ concat_labels = torch.cat([box_labels, concat_points[1]], dim=1)
+ concat_points = (concat_coords, concat_labels)
+ else:
+ concat_points = (box_coords, box_labels)
+
+ sparse_embeddings, dense_embeddings = self.model.sam_prompt_encoder(
+ points=concat_points,
+ boxes=None,
+ masks=mask_input,
+ )
+
+ # Predict masks
+ batched_mode = (
+ concat_points is not None and concat_points[0].shape[0] > 1
+ ) # multi object prediction
+ high_res_features = [
+ feat_level[img_idx].unsqueeze(0)
+ for feat_level in self._features["high_res_feats"]
+ ]
+ low_res_masks, iou_predictions, _, _ = self.model.sam_mask_decoder(
+ image_embeddings=self._features["image_embed"][img_idx].unsqueeze(0),
+ image_pe=self.model.sam_prompt_encoder.get_dense_pe(),
+ sparse_prompt_embeddings=sparse_embeddings,
+ dense_prompt_embeddings=dense_embeddings,
+ multimask_output=multimask_output,
+ repeat_image=batched_mode,
+ high_res_features=high_res_features,
+ )
+
+ # Upscale the masks to the original image resolution
+ masks = self._transforms.postprocess_masks(
+ low_res_masks, self._orig_hw[img_idx]
+ )
+ low_res_masks = torch.clamp(low_res_masks, -32.0, 32.0)
+ if not return_logits:
+ masks = masks > self.mask_threshold
+
+ return masks, iou_predictions, low_res_masks
+
+ def get_image_embedding(self) -> torch.Tensor:
+ """
+ Returns the image embeddings for the currently set image, with
+ shape 1xCxHxW, where C is the embedding dimension and (H,W) are
+ the embedding spatial dimension of SAM (typically C=256, H=W=64).
+ """
+ if not self._is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image(...) to generate an embedding."
+ )
+ assert (
+ self._features is not None
+ ), "Features must exist if an image has been set."
+ return self._features["image_embed"]
+
+ @property
+ def device(self) -> torch.device:
+ return self.model.device
+
+ def reset_predictor(self) -> None:
+ """
+ Resets the image embeddings and other state variables.
+ """
+ self._is_image_set = False
+ self._features = None
+ self._orig_hw = None
+ self._is_batch = False
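+
+
+# --- Illustrative usage sketch (not part of the upstream SAM 2 sources) ---
+# A minimal outline of the intended set_image -> predict flow, kept as comments
+# because it requires downloading model weights; the checkpoint id and the click
+# below are hypothetical placeholders.
+#
+#   import numpy as np
+#   predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny")
+#   image = np.zeros((480, 640, 3), dtype=np.uint8)   # HWC RGB frame
+#   predictor.set_image(image)
+#   masks, ious, low_res = predictor.predict(
+#       point_coords=np.array([[320, 240]]),  # one (x, y) click in pixels
+#       point_labels=np.array([1]),           # 1 = foreground
+#       multimask_output=True,
+#   )
+#   # masks: (3, 480, 640) array of {0., 1.}; ious: (3,) predicted quality scores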
diff --git a/phantom/submodules/sam2/sam2/sam2_video_predictor.py b/phantom/submodules/sam2/sam2/sam2_video_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a7e1a01c4d6e89db0453ce982ea8a31b16651c8
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/sam2_video_predictor.py
@@ -0,0 +1,1223 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+from collections import OrderedDict
+
+import torch
+import torch.nn.functional as F
+
+from tqdm import tqdm
+
+from sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base
+from sam2.utils.misc import concat_points, fill_holes_in_mask_scores, load_video_frames
+
+
+class SAM2VideoPredictor(SAM2Base):
+ """The predictor class to handle user interactions and manage inference states."""
+
+ def __init__(
+ self,
+ fill_hole_area=0,
+ # whether to apply non-overlapping constraints on the output object masks
+ non_overlap_masks=False,
+ # whether to clear non-conditioning memory of the surrounding frames (which may contain outdated information) after adding correction clicks;
+        # note that this would only apply to *single-object tracking* unless `clear_non_cond_mem_for_multi_obj` is also set to True
+ clear_non_cond_mem_around_input=False,
+ # if `add_all_frames_to_correct_as_cond` is True, we also append to the conditioning frame list any frame that receives a later correction click
+        # if `add_all_frames_to_correct_as_cond` is False, we restrict the conditioning frame list to only the initial conditioning frames
+ add_all_frames_to_correct_as_cond=False,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.fill_hole_area = fill_hole_area
+ self.non_overlap_masks = non_overlap_masks
+ self.clear_non_cond_mem_around_input = clear_non_cond_mem_around_input
+ self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
+
+ @torch.inference_mode()
+ def init_state(
+ self,
+ video_path,
+ offload_video_to_cpu=False,
+ offload_state_to_cpu=False,
+ async_loading_frames=False,
+ ):
+ """Initialize an inference state."""
+ compute_device = self.device # device of the model
+ images, video_height, video_width = load_video_frames(
+ video_path=video_path,
+ image_size=self.image_size,
+ offload_video_to_cpu=offload_video_to_cpu,
+ async_loading_frames=async_loading_frames,
+ compute_device=compute_device,
+ )
+ inference_state = {}
+ inference_state["images"] = images
+ inference_state["num_frames"] = len(images)
+ # whether to offload the video frames to CPU memory
+ # turning on this option saves the GPU memory with only a very small overhead
+ inference_state["offload_video_to_cpu"] = offload_video_to_cpu
+ # whether to offload the inference state to CPU memory
+ # turning on this option saves the GPU memory at the cost of a lower tracking fps
+ # (e.g. in a test case of 768x768 model, fps dropped from 27 to 24 when tracking one object
+ # and from 24 to 21 when tracking two objects)
+ inference_state["offload_state_to_cpu"] = offload_state_to_cpu
+ # the original video height and width, used for resizing final output scores
+ inference_state["video_height"] = video_height
+ inference_state["video_width"] = video_width
+ inference_state["device"] = compute_device
+ if offload_state_to_cpu:
+ inference_state["storage_device"] = torch.device("cpu")
+ else:
+ inference_state["storage_device"] = compute_device
+ # inputs on each frame
+ inference_state["point_inputs_per_obj"] = {}
+ inference_state["mask_inputs_per_obj"] = {}
+ # visual features on a small number of recently visited frames for quick interactions
+ inference_state["cached_features"] = {}
+ # values that don't change across frames (so we only need to hold one copy of them)
+ inference_state["constants"] = {}
+ # mapping between client-side object id and model-side object index
+ inference_state["obj_id_to_idx"] = OrderedDict()
+ inference_state["obj_idx_to_id"] = OrderedDict()
+ inference_state["obj_ids"] = []
+ # Slice (view) of each object tracking results, sharing the same memory with "output_dict"
+ inference_state["output_dict_per_obj"] = {}
+        # A temporary storage to hold new outputs when the user interacts with a frame
+        # to add clicks or a mask (it's merged into "output_dict_per_obj" before propagation starts)
+ inference_state["temp_output_dict_per_obj"] = {}
+        # metadata for each tracking frame (e.g. which direction it's tracked)
+ inference_state["frames_tracked_per_obj"] = {}
+ # Warm up the visual backbone and cache the image feature on frame 0
+ self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
+ return inference_state
+
+ @classmethod
+ def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2VideoPredictor":
+ """
+ Load a pretrained model from the Hugging Face hub.
+
+ Arguments:
+ model_id (str): The Hugging Face repository ID.
+ **kwargs: Additional arguments to pass to the model constructor.
+
+ Returns:
+ (SAM2VideoPredictor): The loaded model.
+ """
+ from sam2.build_sam import build_sam2_video_predictor_hf
+
+ sam_model = build_sam2_video_predictor_hf(model_id, **kwargs)
+ return sam_model
+
+ def _obj_id_to_idx(self, inference_state, obj_id):
+ """Map client-side object id to model-side object index."""
+ obj_idx = inference_state["obj_id_to_idx"].get(obj_id, None)
+ if obj_idx is not None:
+ return obj_idx
+
+ # We always allow adding new objects (including after tracking starts).
+ allow_new_object = True
+ if allow_new_object:
+ # get the next object slot
+ obj_idx = len(inference_state["obj_id_to_idx"])
+ inference_state["obj_id_to_idx"][obj_id] = obj_idx
+ inference_state["obj_idx_to_id"][obj_idx] = obj_id
+ inference_state["obj_ids"] = list(inference_state["obj_id_to_idx"])
+ # set up input and output structures for this object
+ inference_state["point_inputs_per_obj"][obj_idx] = {}
+ inference_state["mask_inputs_per_obj"][obj_idx] = {}
+ inference_state["output_dict_per_obj"][obj_idx] = {
+ "cond_frame_outputs": {}, # dict containing {frame_idx: }
+ "non_cond_frame_outputs": {}, # dict containing {frame_idx: }
+ }
+ inference_state["temp_output_dict_per_obj"][obj_idx] = {
+ "cond_frame_outputs": {}, # dict containing {frame_idx: }
+ "non_cond_frame_outputs": {}, # dict containing {frame_idx: }
+ }
+ inference_state["frames_tracked_per_obj"][obj_idx] = {}
+ return obj_idx
+ else:
+ raise RuntimeError(
+ f"Cannot add new object id {obj_id} after tracking starts. "
+ f"All existing object ids: {inference_state['obj_ids']}. "
+ f"Please call 'reset_state' to restart from scratch."
+ )
+
+ def _obj_idx_to_id(self, inference_state, obj_idx):
+ """Map model-side object index to client-side object id."""
+ return inference_state["obj_idx_to_id"][obj_idx]
+
+ def _get_obj_num(self, inference_state):
+ """Get the total number of unique object ids received so far in this session."""
+ return len(inference_state["obj_idx_to_id"])
+
+ @torch.inference_mode()
+ def add_new_points_or_box(
+ self,
+ inference_state,
+ frame_idx,
+ obj_id,
+ points=None,
+ labels=None,
+ clear_old_points=True,
+ normalize_coords=True,
+ box=None,
+ ):
+ """Add new points to a frame."""
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+ point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+ mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+ if (points is not None) != (labels is not None):
+ raise ValueError("points and labels must be provided together")
+ if points is None and box is None:
+ raise ValueError("at least one of points or box must be provided as input")
+
+ if points is None:
+ points = torch.zeros(0, 2, dtype=torch.float32)
+ elif not isinstance(points, torch.Tensor):
+ points = torch.tensor(points, dtype=torch.float32)
+ if labels is None:
+ labels = torch.zeros(0, dtype=torch.int32)
+ elif not isinstance(labels, torch.Tensor):
+ labels = torch.tensor(labels, dtype=torch.int32)
+ if points.dim() == 2:
+ points = points.unsqueeze(0) # add batch dimension
+ if labels.dim() == 1:
+ labels = labels.unsqueeze(0) # add batch dimension
+
+ # If `box` is provided, we add it as the first two points with labels 2 and 3
+ # along with the user-provided points (consistent with how SAM 2 is trained).
+ if box is not None:
+ if not clear_old_points:
+ raise ValueError(
+ "cannot add box without clearing old points, since "
+ "box prompt must be provided before any point prompt "
+ "(please use clear_old_points=True instead)"
+ )
+ if not isinstance(box, torch.Tensor):
+ box = torch.tensor(box, dtype=torch.float32, device=points.device)
+ box_coords = box.reshape(1, 2, 2)
+ box_labels = torch.tensor([2, 3], dtype=torch.int32, device=labels.device)
+ box_labels = box_labels.reshape(1, 2)
+ points = torch.cat([box_coords, points], dim=1)
+ labels = torch.cat([box_labels, labels], dim=1)
+
+ if normalize_coords:
+ video_H = inference_state["video_height"]
+ video_W = inference_state["video_width"]
+ points = points / torch.tensor([video_W, video_H]).to(points.device)
+ # scale the (normalized) coordinates by the model's internal image size
+ points = points * self.image_size
+ points = points.to(inference_state["device"])
+ labels = labels.to(inference_state["device"])
+
+ if not clear_old_points:
+ point_inputs = point_inputs_per_frame.get(frame_idx, None)
+ else:
+ point_inputs = None
+ point_inputs = concat_points(point_inputs, points, labels)
+
+ point_inputs_per_frame[frame_idx] = point_inputs
+ mask_inputs_per_frame.pop(frame_idx, None)
+ # If this frame hasn't been tracked before, we treat it as an initial conditioning
+        # frame, meaning that the input points are to generate segments on this frame without
+ # using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
+ # the input points will be used to correct the already tracked masks.
+ obj_frames_tracked = inference_state["frames_tracked_per_obj"][obj_idx]
+ is_init_cond_frame = frame_idx not in obj_frames_tracked
+ # whether to track in reverse time order
+ if is_init_cond_frame:
+ reverse = False
+ else:
+ reverse = obj_frames_tracked[frame_idx]["reverse"]
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ # Add a frame to conditioning output if it's an initial conditioning frame or
+ # if the model sees all frames receiving clicks/mask as conditioning frames.
+ is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+
+ # Get any previously predicted mask logits on this object and feed it along with
+ # the new clicks into the SAM mask decoder.
+ prev_sam_mask_logits = None
+ # lookup temporary output dict first, which contains the most recent output
+ # (if not found, then lookup conditioning and non-conditioning frame output)
+ prev_out = obj_temp_output_dict[storage_key].get(frame_idx)
+ if prev_out is None:
+ prev_out = obj_output_dict["cond_frame_outputs"].get(frame_idx)
+ if prev_out is None:
+ prev_out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx)
+
+ if prev_out is not None and prev_out["pred_masks"] is not None:
+ device = inference_state["device"]
+ prev_sam_mask_logits = prev_out["pred_masks"].to(device, non_blocking=True)
+ # Clamp the scale of prev_sam_mask_logits to avoid rare numerical issues.
+ prev_sam_mask_logits = torch.clamp(prev_sam_mask_logits, -32.0, 32.0)
+ current_out, _ = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=obj_output_dict, # run on the slice of a single object
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ is_init_cond_frame=is_init_cond_frame,
+ point_inputs=point_inputs,
+ mask_inputs=None,
+ reverse=reverse,
+ # Skip the memory encoder when adding clicks or mask. We execute the memory encoder
+            # at the beginning of `propagate_in_video` (after the user finalizes their clicks). This
+ # allows us to enforce non-overlapping constraints on all objects before encoding
+ # them into memory.
+ run_mem_encoder=False,
+ prev_sam_mask_logits=prev_sam_mask_logits,
+ )
+ # Add the output to the output dict (to be used as future memory)
+ obj_temp_output_dict[storage_key][frame_idx] = current_out
+
+ # Resize the output mask to the original video resolution
+ obj_ids = inference_state["obj_ids"]
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
+ def add_new_points(self, *args, **kwargs):
+ """Deprecated method. Please use `add_new_points_or_box` instead."""
+ return self.add_new_points_or_box(*args, **kwargs)
+
+ @torch.inference_mode()
+ def add_new_mask(
+ self,
+ inference_state,
+ frame_idx,
+ obj_id,
+ mask,
+ ):
+ """Add new mask to a frame."""
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+ point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+ mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+ if not isinstance(mask, torch.Tensor):
+ mask = torch.tensor(mask, dtype=torch.bool)
+ assert mask.dim() == 2
+ mask_H, mask_W = mask.shape
+ mask_inputs_orig = mask[None, None] # add batch and channel dimension
+ mask_inputs_orig = mask_inputs_orig.float().to(inference_state["device"])
+
+ # resize the mask if it doesn't match the model's image size
+ if mask_H != self.image_size or mask_W != self.image_size:
+ mask_inputs = torch.nn.functional.interpolate(
+ mask_inputs_orig,
+ size=(self.image_size, self.image_size),
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ mask_inputs = (mask_inputs >= 0.5).float()
+ else:
+ mask_inputs = mask_inputs_orig
+
+ mask_inputs_per_frame[frame_idx] = mask_inputs
+ point_inputs_per_frame.pop(frame_idx, None)
+ # If this frame hasn't been tracked before, we treat it as an initial conditioning
+        # frame, meaning that the input points are to generate segments on this frame without
+ # using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
+ # the input points will be used to correct the already tracked masks.
+ obj_frames_tracked = inference_state["frames_tracked_per_obj"][obj_idx]
+ is_init_cond_frame = frame_idx not in obj_frames_tracked
+ # whether to track in reverse time order
+ if is_init_cond_frame:
+ reverse = False
+ else:
+ reverse = obj_frames_tracked[frame_idx]["reverse"]
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ # Add a frame to conditioning output if it's an initial conditioning frame or
+ # if the model sees all frames receiving clicks/mask as conditioning frames.
+ is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+
+ current_out, _ = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=obj_output_dict, # run on the slice of a single object
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ is_init_cond_frame=is_init_cond_frame,
+ point_inputs=None,
+ mask_inputs=mask_inputs,
+ reverse=reverse,
+ # Skip the memory encoder when adding clicks or mask. We execute the memory encoder
+            # at the beginning of `propagate_in_video` (after the user finalizes their clicks). This
+ # allows us to enforce non-overlapping constraints on all objects before encoding
+ # them into memory.
+ run_mem_encoder=False,
+ )
+ # Add the output to the output dict (to be used as future memory)
+ obj_temp_output_dict[storage_key][frame_idx] = current_out
+
+ # Resize the output mask to the original video resolution
+ obj_ids = inference_state["obj_ids"]
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
+ def _get_orig_video_res_output(self, inference_state, any_res_masks):
+ """
+ Resize the object scores to the original video resolution (video_res_masks)
+ and apply non-overlapping constraints for final output.
+ """
+ device = inference_state["device"]
+ video_H = inference_state["video_height"]
+ video_W = inference_state["video_width"]
+ any_res_masks = any_res_masks.to(device, non_blocking=True)
+ if any_res_masks.shape[-2:] == (video_H, video_W):
+ video_res_masks = any_res_masks
+ else:
+ video_res_masks = torch.nn.functional.interpolate(
+ any_res_masks,
+ size=(video_H, video_W),
+ mode="bilinear",
+ align_corners=False,
+ )
+ if self.non_overlap_masks:
+ video_res_masks = self._apply_non_overlapping_constraints(video_res_masks)
+ return any_res_masks, video_res_masks
+
+ def _consolidate_temp_output_across_obj(
+ self,
+ inference_state,
+ frame_idx,
+ is_cond,
+ consolidate_at_video_res=False,
+ ):
+ """
+ Consolidate the per-object temporary outputs in `temp_output_dict_per_obj` on
+ a frame into a single output for all objects, including
+ 1) fill any missing objects either from `output_dict_per_obj` (if they exist in
+ `output_dict_per_obj` for this frame) or leave them as placeholder values
+ (if they don't exist in `output_dict_per_obj` for this frame);
+        2) if specified, rerun memory encoder after applying non-overlapping constraints
+ on the object scores.
+ """
+ batch_size = self._get_obj_num(inference_state)
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+ # Optionally, we allow consolidating the temporary outputs at the original
+ # video resolution (to provide a better editing experience for mask prompts).
+ if consolidate_at_video_res:
+ consolidated_H = inference_state["video_height"]
+ consolidated_W = inference_state["video_width"]
+ consolidated_mask_key = "pred_masks_video_res"
+ else:
+ consolidated_H = consolidated_W = self.image_size // 4
+ consolidated_mask_key = "pred_masks"
+
+ # Initialize `consolidated_out`. Its "maskmem_features" and "maskmem_pos_enc"
+ # will be added when rerunning the memory encoder after applying non-overlapping
+ # constraints to object scores. Its "pred_masks" are prefilled with a large
+ # negative value (NO_OBJ_SCORE) to represent missing objects.
+ consolidated_out = {
+ consolidated_mask_key: torch.full(
+ size=(batch_size, 1, consolidated_H, consolidated_W),
+ fill_value=NO_OBJ_SCORE,
+ dtype=torch.float32,
+ device=inference_state["storage_device"],
+ ),
+ }
+ for obj_idx in range(batch_size):
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ out = obj_temp_output_dict[storage_key].get(frame_idx, None)
+ # If the object doesn't appear in "temp_output_dict_per_obj" on this frame,
+ # we fall back and look up its previous output in "output_dict_per_obj".
+ # We look up both "cond_frame_outputs" and "non_cond_frame_outputs" in
+ # "output_dict_per_obj" to find a previous output for this object.
+ if out is None:
+ out = obj_output_dict["cond_frame_outputs"].get(frame_idx, None)
+ if out is None:
+ out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx, None)
+ # If the object doesn't appear in "output_dict_per_obj" either, we skip it
+ # and leave its mask scores to the default scores (i.e. the NO_OBJ_SCORE
+ # placeholder above) and set its object pointer to be a dummy pointer.
+ if out is None:
+ continue
+ # Add the temporary object output mask to consolidated output mask
+ obj_mask = out["pred_masks"]
+ consolidated_pred_masks = consolidated_out[consolidated_mask_key]
+ if obj_mask.shape[-2:] == consolidated_pred_masks.shape[-2:]:
+ consolidated_pred_masks[obj_idx : obj_idx + 1] = obj_mask
+ else:
+ # Resize first if temporary object mask has a different resolution
+ resized_obj_mask = torch.nn.functional.interpolate(
+ obj_mask,
+ size=consolidated_pred_masks.shape[-2:],
+ mode="bilinear",
+ align_corners=False,
+ )
+ consolidated_pred_masks[obj_idx : obj_idx + 1] = resized_obj_mask
+
+ return consolidated_out
+
+ @torch.inference_mode()
+ def propagate_in_video_preflight(self, inference_state):
+ """Prepare inference_state and consolidate temporary outputs before tracking."""
+ # Check and make sure that every object has received input points or masks.
+ batch_size = self._get_obj_num(inference_state)
+ if batch_size == 0:
+ raise RuntimeError(
+ "No input points or masks are provided for any object; please add inputs first."
+ )
+
+ # Consolidate per-object temporary outputs in "temp_output_dict_per_obj" and
+ # add them into "output_dict".
+ for obj_idx in range(batch_size):
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ for is_cond in [False, True]:
+ # Separately consolidate conditioning and non-conditioning temp outputs
+ storage_key = (
+ "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+ )
+ # Find all the frames that contain temporary outputs for any objects
+                # (these should be the frames that have just received clicks or mask inputs
+ # via `add_new_points_or_box` or `add_new_mask`)
+ for frame_idx, out in obj_temp_output_dict[storage_key].items():
+ # Run memory encoder on the temporary outputs (if the memory feature is missing)
+ if out["maskmem_features"] is None:
+ high_res_masks = torch.nn.functional.interpolate(
+ out["pred_masks"].to(inference_state["device"]),
+ size=(self.image_size, self.image_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+ maskmem_features, maskmem_pos_enc = self._run_memory_encoder(
+ inference_state=inference_state,
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ high_res_masks=high_res_masks,
+ object_score_logits=out["object_score_logits"],
+ # these frames are what the user interacted with
+ is_mask_from_pts=True,
+ )
+ out["maskmem_features"] = maskmem_features
+ out["maskmem_pos_enc"] = maskmem_pos_enc
+
+ obj_output_dict[storage_key][frame_idx] = out
+ if self.clear_non_cond_mem_around_input:
+ # clear non-conditioning memory of the surrounding frames
+ self._clear_obj_non_cond_mem_around_input(
+ inference_state, frame_idx, obj_idx
+ )
+
+ # clear temporary outputs in `temp_output_dict_per_obj`
+ obj_temp_output_dict[storage_key].clear()
+
+ # check and make sure that every object has received input points or masks
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ if len(obj_output_dict["cond_frame_outputs"]) == 0:
+ obj_id = self._obj_idx_to_id(inference_state, obj_idx)
+ raise RuntimeError(
+ f"No input points or masks are provided for object id {obj_id}; please add inputs first."
+ )
+ # edge case: if an output is added to "cond_frame_outputs", we remove any prior
+ # output on the same frame in "non_cond_frame_outputs"
+ for frame_idx in obj_output_dict["cond_frame_outputs"]:
+ obj_output_dict["non_cond_frame_outputs"].pop(frame_idx, None)
+
+ @torch.inference_mode()
+ def propagate_in_video(
+ self,
+ inference_state,
+ start_frame_idx=None,
+ max_frame_num_to_track=None,
+ reverse=False,
+ ):
+ """Propagate the input points across frames to track in the entire video."""
+ self.propagate_in_video_preflight(inference_state)
+
+ obj_ids = inference_state["obj_ids"]
+ num_frames = inference_state["num_frames"]
+ batch_size = self._get_obj_num(inference_state)
+
+ # set start index, end index, and processing order
+ if start_frame_idx is None:
+ # default: start from the earliest frame with input points
+ start_frame_idx = min(
+ t
+ for obj_output_dict in inference_state["output_dict_per_obj"].values()
+ for t in obj_output_dict["cond_frame_outputs"]
+ )
+ if max_frame_num_to_track is None:
+ # default: track all the frames in the video
+ max_frame_num_to_track = num_frames
+ if reverse:
+ end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0)
+ if start_frame_idx > 0:
+ processing_order = range(start_frame_idx, end_frame_idx - 1, -1)
+ else:
+ processing_order = [] # skip reverse tracking if starting from frame 0
+ else:
+ end_frame_idx = min(
+ start_frame_idx + max_frame_num_to_track, num_frames - 1
+ )
+ processing_order = range(start_frame_idx, end_frame_idx + 1)
+
+ for frame_idx in tqdm(processing_order, desc="propagate in video"):
+ pred_masks_per_obj = [None] * batch_size
+ for obj_idx in range(batch_size):
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ # We skip those frames already in consolidated outputs (these are frames
+ # that received input clicks or mask). Note that we cannot directly run
+ # batched forward on them via `_run_single_frame_inference` because the
+ # number of clicks on each object might be different.
+ if frame_idx in obj_output_dict["cond_frame_outputs"]:
+ storage_key = "cond_frame_outputs"
+ current_out = obj_output_dict[storage_key][frame_idx]
+ device = inference_state["device"]
+ pred_masks = current_out["pred_masks"].to(device, non_blocking=True)
+ if self.clear_non_cond_mem_around_input:
+ # clear non-conditioning memory of the surrounding frames
+ self._clear_obj_non_cond_mem_around_input(
+ inference_state, frame_idx, obj_idx
+ )
+ else:
+ storage_key = "non_cond_frame_outputs"
+ current_out, pred_masks = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=obj_output_dict,
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ is_init_cond_frame=False,
+ point_inputs=None,
+ mask_inputs=None,
+ reverse=reverse,
+ run_mem_encoder=True,
+ )
+ obj_output_dict[storage_key][frame_idx] = current_out
+
+ inference_state["frames_tracked_per_obj"][obj_idx][frame_idx] = {
+ "reverse": reverse
+ }
+ pred_masks_per_obj[obj_idx] = pred_masks
+
+ # Resize the output mask to the original video resolution (we directly use
+ # the mask scores on GPU for output to avoid any CPU conversion in between)
+ if len(pred_masks_per_obj) > 1:
+ all_pred_masks = torch.cat(pred_masks_per_obj, dim=0)
+ else:
+ all_pred_masks = pred_masks_per_obj[0]
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, all_pred_masks
+ )
+ yield frame_idx, obj_ids, video_res_masks
+
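+    # --- Illustrative usage sketch (not part of the upstream SAM 2 sources) ---
+    # A typical interactive-tracking loop; the path, object id and click below are
+    # hypothetical placeholders:
+    #
+    #   state = predictor.init_state(video_path="/path/to/frames")
+    #   predictor.add_new_points_or_box(
+    #       state, frame_idx=0, obj_id=1,
+    #       points=[[210.0, 350.0]], labels=[1],  # one positive click on frame 0
+    #   )
+    #   for frame_idx, obj_ids, masks in predictor.propagate_in_video(state):
+    #       ...  # masks: (num_objects, 1, H, W) mask logits at the original video resolution
+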
+ @torch.inference_mode()
+ def clear_all_prompts_in_frame(
+ self, inference_state, frame_idx, obj_id, need_output=True
+ ):
+ """Remove all input points or mask in a specific frame for a given object."""
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+
+ # Clear the conditioning information on the given frame
+ inference_state["point_inputs_per_obj"][obj_idx].pop(frame_idx, None)
+ inference_state["mask_inputs_per_obj"][obj_idx].pop(frame_idx, None)
+
+ temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+ temp_output_dict_per_obj[obj_idx]["cond_frame_outputs"].pop(frame_idx, None)
+ temp_output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].pop(frame_idx, None)
+
+ # Remove the frame's conditioning output (possibly downgrading it to non-conditioning)
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ out = obj_output_dict["cond_frame_outputs"].pop(frame_idx, None)
+ if out is not None:
+ # The frame is not a conditioning frame anymore since it's not receiving inputs,
+ # so we "downgrade" its output (if exists) to a non-conditioning frame output.
+ obj_output_dict["non_cond_frame_outputs"][frame_idx] = out
+ inference_state["frames_tracked_per_obj"][obj_idx].pop(frame_idx, None)
+
+ if not need_output:
+ return
+ # Finally, output updated masks per object (after removing the inputs above)
+ obj_ids = inference_state["obj_ids"]
+ is_cond = any(
+ frame_idx in obj_temp_output_dict["cond_frame_outputs"]
+ for obj_temp_output_dict in temp_output_dict_per_obj.values()
+ )
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
+ @torch.inference_mode()
+ def reset_state(self, inference_state):
+ """Remove all input points or mask in all frames throughout the video."""
+ self._reset_tracking_results(inference_state)
+ # Remove all object ids
+ inference_state["obj_id_to_idx"].clear()
+ inference_state["obj_idx_to_id"].clear()
+ inference_state["obj_ids"].clear()
+ inference_state["point_inputs_per_obj"].clear()
+ inference_state["mask_inputs_per_obj"].clear()
+ inference_state["output_dict_per_obj"].clear()
+ inference_state["temp_output_dict_per_obj"].clear()
+ inference_state["frames_tracked_per_obj"].clear()
+
+ def _reset_tracking_results(self, inference_state):
+ """Reset all tracking inputs and results across the videos."""
+ for v in inference_state["point_inputs_per_obj"].values():
+ v.clear()
+ for v in inference_state["mask_inputs_per_obj"].values():
+ v.clear()
+ for v in inference_state["output_dict_per_obj"].values():
+ v["cond_frame_outputs"].clear()
+ v["non_cond_frame_outputs"].clear()
+ for v in inference_state["temp_output_dict_per_obj"].values():
+ v["cond_frame_outputs"].clear()
+ v["non_cond_frame_outputs"].clear()
+ for v in inference_state["frames_tracked_per_obj"].values():
+ v.clear()
+
+ def _get_image_feature(self, inference_state, frame_idx, batch_size):
+ """Compute the image features on a given frame."""
+ # Look up in the cache first
+ image, backbone_out = inference_state["cached_features"].get(
+ frame_idx, (None, None)
+ )
+ if backbone_out is None:
+ # Cache miss -- we will run inference on a single image
+ device = inference_state["device"]
+ image = inference_state["images"][frame_idx].to(device).float().unsqueeze(0)
+ backbone_out = self.forward_image(image)
+ # Cache the most recent frame's feature (for repeated interactions with
+ # a frame; we can use an LRU cache for more frames in the future).
+ inference_state["cached_features"] = {frame_idx: (image, backbone_out)}
+
+ # expand the features to have the same dimension as the number of objects
+ expanded_image = image.expand(batch_size, -1, -1, -1)
+ expanded_backbone_out = {
+ "backbone_fpn": backbone_out["backbone_fpn"].copy(),
+ "vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
+ }
+ for i, feat in enumerate(expanded_backbone_out["backbone_fpn"]):
+ expanded_backbone_out["backbone_fpn"][i] = feat.expand(
+ batch_size, -1, -1, -1
+ )
+ for i, pos in enumerate(expanded_backbone_out["vision_pos_enc"]):
+ pos = pos.expand(batch_size, -1, -1, -1)
+ expanded_backbone_out["vision_pos_enc"][i] = pos
+
+ features = self._prepare_backbone_features(expanded_backbone_out)
+ features = (expanded_image,) + features
+ return features
+
+ def _run_single_frame_inference(
+ self,
+ inference_state,
+ output_dict,
+ frame_idx,
+ batch_size,
+ is_init_cond_frame,
+ point_inputs,
+ mask_inputs,
+ reverse,
+ run_mem_encoder,
+ prev_sam_mask_logits=None,
+ ):
+ """Run tracking on a single frame based on current inputs and previous memory."""
+ # Retrieve correct image features
+ (
+ _,
+ _,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ ) = self._get_image_feature(inference_state, frame_idx, batch_size)
+
+ # point and mask should not appear as input simultaneously on the same frame
+ assert point_inputs is None or mask_inputs is None
+ current_out = self.track_step(
+ frame_idx=frame_idx,
+ is_init_cond_frame=is_init_cond_frame,
+ current_vision_feats=current_vision_feats,
+ current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes,
+ point_inputs=point_inputs,
+ mask_inputs=mask_inputs,
+ output_dict=output_dict,
+ num_frames=inference_state["num_frames"],
+ track_in_reverse=reverse,
+ run_mem_encoder=run_mem_encoder,
+ prev_sam_mask_logits=prev_sam_mask_logits,
+ )
+
+ # optionally offload the output to CPU memory to save GPU space
+ storage_device = inference_state["storage_device"]
+ maskmem_features = current_out["maskmem_features"]
+ if maskmem_features is not None:
+ maskmem_features = maskmem_features.to(torch.bfloat16)
+ maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
+ pred_masks_gpu = current_out["pred_masks"]
+ # potentially fill holes in the predicted masks
+ if self.fill_hole_area > 0:
+ pred_masks_gpu = fill_holes_in_mask_scores(
+ pred_masks_gpu, self.fill_hole_area
+ )
+ pred_masks = pred_masks_gpu.to(storage_device, non_blocking=True)
+ # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+ maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, current_out)
+ # object pointer is a small tensor, so we always keep it on GPU memory for fast access
+ obj_ptr = current_out["obj_ptr"]
+ object_score_logits = current_out["object_score_logits"]
+ # make a compact version of this frame's output to reduce the state size
+ compact_current_out = {
+ "maskmem_features": maskmem_features,
+ "maskmem_pos_enc": maskmem_pos_enc,
+ "pred_masks": pred_masks,
+ "obj_ptr": obj_ptr,
+ "object_score_logits": object_score_logits,
+ }
+ return compact_current_out, pred_masks_gpu
+
+ def _run_memory_encoder(
+ self,
+ inference_state,
+ frame_idx,
+ batch_size,
+ high_res_masks,
+ object_score_logits,
+ is_mask_from_pts,
+ ):
+ """
+ Run the memory encoder on `high_res_masks`. This is usually after applying
+ non-overlapping constraints to object scores. Since their scores changed, their
+ memories also need to be computed again with the memory encoder.
+ """
+ # Retrieve correct image features
+ _, _, current_vision_feats, _, feat_sizes = self._get_image_feature(
+ inference_state, frame_idx, batch_size
+ )
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+ current_vision_feats=current_vision_feats,
+ feat_sizes=feat_sizes,
+ pred_masks_high_res=high_res_masks,
+ object_score_logits=object_score_logits,
+ is_mask_from_pts=is_mask_from_pts,
+ )
+
+ # optionally offload the output to CPU memory to save GPU space
+ storage_device = inference_state["storage_device"]
+ maskmem_features = maskmem_features.to(torch.bfloat16)
+ maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
+ # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+ maskmem_pos_enc = self._get_maskmem_pos_enc(
+ inference_state, {"maskmem_pos_enc": maskmem_pos_enc}
+ )
+ return maskmem_features, maskmem_pos_enc
+
+ def _get_maskmem_pos_enc(self, inference_state, current_out):
+ """
+ `maskmem_pos_enc` is the same across frames and objects, so we cache it as
+ a constant in the inference session to reduce session storage size.
+ """
+ model_constants = inference_state["constants"]
+ # "out_maskmem_pos_enc" should be either a list of tensors or None
+ out_maskmem_pos_enc = current_out["maskmem_pos_enc"]
+ if out_maskmem_pos_enc is not None:
+ if "maskmem_pos_enc" not in model_constants:
+ assert isinstance(out_maskmem_pos_enc, list)
+ # only take the slice for one object, since it's same across objects
+ maskmem_pos_enc = [x[0:1].clone() for x in out_maskmem_pos_enc]
+ model_constants["maskmem_pos_enc"] = maskmem_pos_enc
+ else:
+ maskmem_pos_enc = model_constants["maskmem_pos_enc"]
+ # expand the cached maskmem_pos_enc to the actual batch size
+ batch_size = out_maskmem_pos_enc[0].size(0)
+ expanded_maskmem_pos_enc = [
+ x.expand(batch_size, -1, -1, -1) for x in maskmem_pos_enc
+ ]
+ else:
+ expanded_maskmem_pos_enc = None
+ return expanded_maskmem_pos_enc
+
+ @torch.inference_mode()
+ def remove_object(self, inference_state, obj_id, strict=False, need_output=True):
+ """
+ Remove an object id from the tracking state. If strict is True, we check whether
+ the object id actually exists and raise an error if it doesn't exist.
+ """
+ old_obj_idx_to_rm = inference_state["obj_id_to_idx"].get(obj_id, None)
+ updated_frames = []
+ # Check whether this object_id to remove actually exists and possibly raise an error.
+ if old_obj_idx_to_rm is None:
+ if not strict:
+ return inference_state["obj_ids"], updated_frames
+ raise RuntimeError(
+ f"Cannot remove object id {obj_id} as it doesn't exist. "
+ f"All existing object ids: {inference_state['obj_ids']}."
+ )
+
+ # If this is the only remaining object id, we simply reset the state.
+ if len(inference_state["obj_id_to_idx"]) == 1:
+ self.reset_state(inference_state)
+ return inference_state["obj_ids"], updated_frames
+
+ # There are still remaining objects after removing this object id. In this case,
+ # we need to delete the object storage from inference state tensors.
+ # Step 0: clear the input on those frames where this object id has point or mask input
+ # (note that this step is required as it might downgrade conditioning frames to
+ # non-conditioning ones)
+ obj_input_frames_inds = set()
+ obj_input_frames_inds.update(
+ inference_state["point_inputs_per_obj"][old_obj_idx_to_rm]
+ )
+ obj_input_frames_inds.update(
+ inference_state["mask_inputs_per_obj"][old_obj_idx_to_rm]
+ )
+ for frame_idx in obj_input_frames_inds:
+ self.clear_all_prompts_in_frame(
+ inference_state, frame_idx, obj_id, need_output=False
+ )
+
+ # Step 1: Update the object id mapping (note that it must be done after Step 0,
+ # since Step 0 still requires the old object id mappings in inference_state)
+ old_obj_ids = inference_state["obj_ids"]
+ old_obj_inds = list(range(len(old_obj_ids)))
+ remain_old_obj_inds = old_obj_inds.copy()
+ remain_old_obj_inds.remove(old_obj_idx_to_rm)
+ new_obj_ids = [old_obj_ids[old_idx] for old_idx in remain_old_obj_inds]
+ new_obj_inds = list(range(len(new_obj_ids)))
+ # build new mappings
+ old_idx_to_new_idx = dict(zip(remain_old_obj_inds, new_obj_inds))
+ inference_state["obj_id_to_idx"] = dict(zip(new_obj_ids, new_obj_inds))
+ inference_state["obj_idx_to_id"] = dict(zip(new_obj_inds, new_obj_ids))
+ inference_state["obj_ids"] = new_obj_ids
+
+ # Step 2: For per-object tensor storage, we shift their obj_idx in the dict keys.
+ def _map_keys(container):
+ new_kvs = []
+ for k in old_obj_inds:
+ v = container.pop(k)
+ if k in old_idx_to_new_idx:
+ new_kvs.append((old_idx_to_new_idx[k], v))
+ container.update(new_kvs)
+
+ _map_keys(inference_state["point_inputs_per_obj"])
+ _map_keys(inference_state["mask_inputs_per_obj"])
+ _map_keys(inference_state["output_dict_per_obj"])
+ _map_keys(inference_state["temp_output_dict_per_obj"])
+ _map_keys(inference_state["frames_tracked_per_obj"])
+
+ # Step 3: Further collect the outputs on those frames in `obj_input_frames_inds`, which
+ # could show an updated mask for objects previously occluded by the object being removed
+ if need_output:
+ temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+ for frame_idx in obj_input_frames_inds:
+ is_cond = any(
+ frame_idx in obj_temp_output_dict["cond_frame_outputs"]
+ for obj_temp_output_dict in temp_output_dict_per_obj.values()
+ )
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ updated_frames.append((frame_idx, video_res_masks))
+
+ return inference_state["obj_ids"], updated_frames
+
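+ # A minimal usage sketch for `remove_object` above (illustrative only, not part of
+ # the library): `predictor` and `state` are assumed to come from an interactive
+ # session where several objects were added via `add_new_points_or_box`.
+ #
+ #     remaining_obj_ids, updated_frames = predictor.remove_object(state, obj_id=2)
+ #     for frame_idx, video_res_masks in updated_frames:
+ #         # frames whose consolidated masks changed after the removal; re-render them
+ #         refresh_display(frame_idx, video_res_masks)  # hypothetical UI helper
+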
+ def _clear_non_cond_mem_around_input(self, inference_state, frame_idx):
+ """
+ Remove the non-conditioning memory around the input frame. When users provide
+ correction clicks, the surrounding frames' non-conditioning memories can still
+ contain outdated object appearance information and could confuse the model.
+
+ This method clears those non-conditioning memories surrounding the interacted
+ frame to avoid giving the model both old and new information about the object.
+ """
+ r = self.memory_temporal_stride_for_eval
+ frame_idx_begin = frame_idx - r * self.num_maskmem
+ frame_idx_end = frame_idx + r * self.num_maskmem
+ batch_size = self._get_obj_num(inference_state)
+ for obj_idx in range(batch_size):
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ non_cond_frame_outputs = obj_output_dict["non_cond_frame_outputs"]
+ for t in range(frame_idx_begin, frame_idx_end + 1):
+ non_cond_frame_outputs.pop(t, None)
+
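+# A minimal end-to-end sketch of the interactive API above (illustrative only; the
+# builder import is the usual entry point, and the config/checkpoint paths are
+# user-supplied placeholders, not guaranteed to match this repository's layout):
+#
+#     from sam2.build_sam import build_sam2_video_predictor
+#
+#     predictor = build_sam2_video_predictor(config_file, ckpt_path)  # user-supplied paths
+#     state = predictor.init_state(video_path="path/to/video_or_frames")
+#     predictor.add_new_points_or_box(state, frame_idx=0, obj_id=1,
+#                                     points=[[210.0, 350.0]], labels=[1])
+#     for frame_idx, obj_ids, masks in predictor.propagate_in_video(state):
+#         pass  # `masks` are logits at the original video resolution; threshold at 0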
+
+class SAM2VideoPredictorVOS(SAM2VideoPredictor):
+ """Optimized for the VOS setting"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._compile_all_components()
+
+ def _compile_all_components(self):
+ print("Compiling all components for the VOS setting. The first run may be very slow.")
+ self.memory_encoder.forward = torch.compile(
+ self.memory_encoder.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=False,
+ )
+
+ self.memory_attention.forward = torch.compile(
+ self.memory_attention.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=True, # Num. of memories varies
+ )
+
+ self.sam_prompt_encoder.forward = torch.compile(
+ self.sam_prompt_encoder.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=False, # Accuracy regression on True
+ )
+
+ self.sam_mask_decoder.forward = torch.compile(
+ self.sam_mask_decoder.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=False, # Accuracy regression on True
+ )
+
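+ # Editorial note: `torch.compile` wraps these forwards lazily, so the real compile
+ # cost is paid on the first call of each wrapped module (and may be paid again if
+ # input shapes change for the modules compiled with `dynamic=False`). A hedged
+ # warm-up sketch, assuming a `predictor` instance and a short placeholder clip:
+ #
+ #     state = predictor.init_state(video_path="path/to/warmup_clip")
+ #     predictor.add_new_points_or_box(state, frame_idx=0, obj_id=1,
+ #                                     points=[[10.0, 10.0]], labels=[1])
+ #     _ = list(predictor.propagate_in_video(state))
+ #     predictor.reset_state(state)
+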
+ def forward_image(self, img_batch: torch.Tensor):
+ """
+ Identical to the corresponding method in the parent (SAM2VideoPredictor), but
+ cloning the backbone features and pos encoding to enable compilation.
+ """
+ backbone_out = self.image_encoder(img_batch)
+ if self.use_high_res_features_in_sam:
+ # precompute projected level 0 and level 1 features in SAM decoder
+ # to avoid running it again on every SAM click
+ backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(
+ backbone_out["backbone_fpn"][0]
+ )
+ backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(
+ backbone_out["backbone_fpn"][1]
+ )
+ # Clone to help torch.compile
+ for i in range(len(backbone_out["backbone_fpn"])):
+ backbone_out["backbone_fpn"][i] = backbone_out["backbone_fpn"][i].clone()
+ backbone_out["vision_pos_enc"][i] = backbone_out["vision_pos_enc"][
+ i
+ ].clone()
+ return backbone_out
+
+ def _forward_sam_heads(
+ self,
+ backbone_features,
+ point_inputs=None,
+ mask_inputs=None,
+ high_res_features=None,
+ multimask_output=False,
+ ):
+ """
+ Identical to the corresponding method in the parent (SAM2VideoPredictor), but
+ cloning the outputs of prompt_encoder and mask_decoder to enable compilation.
+ """
+ B = backbone_features.size(0)
+ device = backbone_features.device
+ assert backbone_features.size(1) == self.sam_prompt_embed_dim
+ assert backbone_features.size(2) == self.sam_image_embedding_size
+ assert backbone_features.size(3) == self.sam_image_embedding_size
+
+ # a) Handle point prompts
+ if point_inputs is not None:
+ sam_point_coords = point_inputs["point_coords"]
+ sam_point_labels = point_inputs["point_labels"]
+ assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
+ else:
+ # If no points are provided, pad with an empty point (with label -1)
+ sam_point_coords = torch.zeros(B, 1, 2, device=device)
+ sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
+
+ # b) Handle mask prompts
+ if mask_inputs is not None:
+ # If mask_inputs is provided, downsize it into low-res mask input if needed
+ # and feed it as a dense mask prompt into the SAM mask encoder
+ assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
+ if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
+ sam_mask_prompt = F.interpolate(
+ mask_inputs.float(),
+ size=self.sam_prompt_encoder.mask_input_size,
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ else:
+ sam_mask_prompt = mask_inputs
+ else:
+ # Otherwise, simply feed None (and SAM's prompt encoder will add
+ # a learned `no_mask_embed` to indicate no mask input in this case).
+ sam_mask_prompt = None
+
+ sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
+ points=(sam_point_coords, sam_point_labels),
+ boxes=None,
+ masks=sam_mask_prompt,
+ )
+ # Clone image_pe and the outputs of sam_prompt_encoder
+ # to enable compilation
+ sparse_embeddings = sparse_embeddings.clone()
+ dense_embeddings = dense_embeddings.clone()
+ image_pe = self.sam_prompt_encoder.get_dense_pe().clone()
+ (
+ low_res_multimasks,
+ ious,
+ sam_output_tokens,
+ object_score_logits,
+ ) = self.sam_mask_decoder(
+ image_embeddings=backbone_features,
+ image_pe=image_pe,
+ sparse_prompt_embeddings=sparse_embeddings,
+ dense_prompt_embeddings=dense_embeddings,
+ multimask_output=multimask_output,
+ repeat_image=False, # the image is already batched
+ high_res_features=high_res_features,
+ )
+ # Clone the output of sam_mask_decoder
+ # to enable compilation
+ low_res_multimasks = low_res_multimasks.clone()
+ ious = ious.clone()
+ sam_output_tokens = sam_output_tokens.clone()
+ object_score_logits = object_score_logits.clone()
+
+ if self.pred_obj_scores:
+ is_obj_appearing = object_score_logits > 0
+
+ # Mask used for spatial memories is always a *hard* choice between obj and no obj,
+ # consistent with the actual mask prediction
+ low_res_multimasks = torch.where(
+ is_obj_appearing[:, None, None],
+ low_res_multimasks,
+ NO_OBJ_SCORE,
+ )
+
+ # convert masks from possibly bfloat16 (or float16) to float32
+ # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
+ low_res_multimasks = low_res_multimasks.float()
+ high_res_multimasks = F.interpolate(
+ low_res_multimasks,
+ size=(self.image_size, self.image_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+
+ sam_output_token = sam_output_tokens[:, 0]
+ if multimask_output:
+ # take the best mask prediction (with the highest IoU estimation)
+ best_iou_inds = torch.argmax(ious, dim=-1)
+ batch_inds = torch.arange(B, device=device)
+ low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+ high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+ if sam_output_tokens.size(1) > 1:
+ sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
+ else:
+ low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
+
+ # Extract object pointer from the SAM output token (with occlusion handling)
+ obj_ptr = self.obj_ptr_proj(sam_output_token)
+ if self.pred_obj_scores:
+ # Allow *soft* no obj ptr, unlike for masks
+ if self.soft_no_obj_ptr:
+ lambda_is_obj_appearing = object_score_logits.sigmoid()
+ else:
+ lambda_is_obj_appearing = is_obj_appearing.float()
+
+ if self.fixed_no_obj_ptr:
+ obj_ptr = lambda_is_obj_appearing * obj_ptr
+ obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
+
+ return (
+ low_res_multimasks,
+ high_res_multimasks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ )
+
+ def _encode_new_memory(
+ self,
+ current_vision_feats,
+ feat_sizes,
+ pred_masks_high_res,
+ object_score_logits,
+ is_mask_from_pts,
+ ):
+ """
+ Identical to the corresponding method in the parent (SAM2VideoPredictor), but
+ cloning the memories and their pos enc to enable compilation.
+ """
+ B = current_vision_feats[-1].size(1) # batch size on this frame
+ C = self.hidden_dim
+ H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
+ # top-level feature, (HW)BC => BCHW
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
+ if self.non_overlap_masks_for_mem_enc and not self.training:
+ # optionally, apply non-overlapping constraints to the masks (it's applied
+ # in the batch dimension and should only be used during eval, where all
+ # the objects come from the same video under batch size 1).
+ pred_masks_high_res = self._apply_non_overlapping_constraints(
+ pred_masks_high_res
+ )
+ # scale the raw mask logits with a temperature before applying sigmoid
+ binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
+ if binarize and not self.training:
+ mask_for_mem = (pred_masks_high_res > 0).float()
+ else:
+ # apply sigmoid on the raw mask logits to turn them into range (0, 1)
+ mask_for_mem = torch.sigmoid(pred_masks_high_res)
+ # apply scale and bias terms to the sigmoid probabilities
+ if self.sigmoid_scale_for_mem_enc != 1.0:
+ mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
+ if self.sigmoid_bias_for_mem_enc != 0.0:
+ mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
+ maskmem_out = self.memory_encoder(
+ pix_feat, mask_for_mem, skip_mask_sigmoid=True # sigmoid already applied
+ )
+ # Clone the feats and pos_enc to enable compilation
+ maskmem_features = maskmem_out["vision_features"].clone()
+ maskmem_pos_enc = [m.clone() for m in maskmem_out["vision_pos_enc"]]
+ # add a no-object embedding to the spatial memory to indicate that the frame
+ # is predicted to be occluded (i.e. no object is appearing in the frame)
+ if self.no_obj_embed_spatial is not None:
+ is_obj_appearing = (object_score_logits > 0).float()
+ maskmem_features += (
+ 1 - is_obj_appearing[..., None, None]
+ ) * self.no_obj_embed_spatial[..., None, None].expand(
+ *maskmem_features.shape
+ )
+
+ return maskmem_features, maskmem_pos_enc
diff --git a/phantom/submodules/sam2/sam2/sam2_video_predictor_legacy.py b/phantom/submodules/sam2/sam2/sam2_video_predictor_legacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7e01ccf972491904b013526333826b337354db1
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/sam2_video_predictor_legacy.py
@@ -0,0 +1,1172 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+from collections import OrderedDict
+
+import torch
+
+from tqdm import tqdm
+
+from sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base
+from sam2.utils.misc import concat_points, fill_holes_in_mask_scores, load_video_frames
+
+
+class SAM2VideoPredictor(SAM2Base):
+ """The predictor class to handle user interactions and manage inference states."""
+
+ def __init__(
+ self,
+ fill_hole_area=0,
+ # whether to apply non-overlapping constraints on the output object masks
+ non_overlap_masks=False,
+ # whether to clear non-conditioning memory of the surrounding frames (which may contain outdated information) after adding correction clicks;
+ # note that this would only apply to *single-object tracking* unless `clear_non_cond_mem_for_multi_obj` is also set to True
+ clear_non_cond_mem_around_input=False,
+ # whether to also clear non-conditioning memory of the surrounding frames (only effective when `clear_non_cond_mem_around_input` is True).
+ clear_non_cond_mem_for_multi_obj=False,
+ # if `add_all_frames_to_correct_as_cond` is True, we also append to the conditioning frame list any frame that receives a later correction click
+ # if `add_all_frames_to_correct_as_cond` is False, we restrict the conditioning frame list to only the initial conditioning frames
+ add_all_frames_to_correct_as_cond=False,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.fill_hole_area = fill_hole_area
+ self.non_overlap_masks = non_overlap_masks
+ self.clear_non_cond_mem_around_input = clear_non_cond_mem_around_input
+ self.clear_non_cond_mem_for_multi_obj = clear_non_cond_mem_for_multi_obj
+ self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
+
+ @torch.inference_mode()
+ def init_state(
+ self,
+ video_path,
+ offload_video_to_cpu=False,
+ offload_state_to_cpu=False,
+ async_loading_frames=False,
+ ):
+ """Initialize an inference state."""
+ compute_device = self.device # device of the model
+ images, video_height, video_width = load_video_frames(
+ video_path=video_path,
+ image_size=self.image_size,
+ offload_video_to_cpu=offload_video_to_cpu,
+ async_loading_frames=async_loading_frames,
+ compute_device=compute_device,
+ )
+ inference_state = {}
+ inference_state["images"] = images
+ inference_state["num_frames"] = len(images)
+ # whether to offload the video frames to CPU memory
+ # turning on this option saves the GPU memory with only a very small overhead
+ inference_state["offload_video_to_cpu"] = offload_video_to_cpu
+ # whether to offload the inference state to CPU memory
+ # turning on this option saves the GPU memory at the cost of a lower tracking fps
+ # (e.g. in a test case of 768x768 model, fps dropped from 27 to 24 when tracking one object
+ # and from 24 to 21 when tracking two objects)
+ inference_state["offload_state_to_cpu"] = offload_state_to_cpu
+ # the original video height and width, used for resizing final output scores
+ inference_state["video_height"] = video_height
+ inference_state["video_width"] = video_width
+ inference_state["device"] = compute_device
+ if offload_state_to_cpu:
+ inference_state["storage_device"] = torch.device("cpu")
+ else:
+ inference_state["storage_device"] = compute_device
+ # inputs on each frame
+ inference_state["point_inputs_per_obj"] = {}
+ inference_state["mask_inputs_per_obj"] = {}
+ # visual features on a small number of recently visited frames for quick interactions
+ inference_state["cached_features"] = {}
+ # values that don't change across frames (so we only need to hold one copy of them)
+ inference_state["constants"] = {}
+ # mapping between client-side object id and model-side object index
+ inference_state["obj_id_to_idx"] = OrderedDict()
+ inference_state["obj_idx_to_id"] = OrderedDict()
+ inference_state["obj_ids"] = []
+ # A storage to hold the model's tracking results and states on each frame
+ inference_state["output_dict"] = {
+ "cond_frame_outputs": {}, # dict containing {frame_idx: }
+ "non_cond_frame_outputs": {}, # dict containing {frame_idx: }
+ }
+ # Slice (view) of each object tracking results, sharing the same memory with "output_dict"
+ inference_state["output_dict_per_obj"] = {}
+ # A temporary storage to hold new outputs when user interact with a frame
+ # to add clicks or mask (it's merged into "output_dict" before propagation starts)
+ inference_state["temp_output_dict_per_obj"] = {}
+ # Frames that already hold consolidated outputs from click or mask inputs
+ # (we directly use their consolidated outputs during tracking)
+ inference_state["consolidated_frame_inds"] = {
+ "cond_frame_outputs": set(), # set containing frame indices
+ "non_cond_frame_outputs": set(), # set containing frame indices
+ }
+ # metadata for each tracking frame (e.g. which direction it's tracked)
+ inference_state["tracking_has_started"] = False
+ inference_state["frames_already_tracked"] = {}
+ # Warm up the visual backbone and cache the image feature on frame 0
+ self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
+ return inference_state
+
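+ # A minimal sketch of how `init_state` above is typically used (illustrative only;
+ # the video path is a placeholder):
+ #
+ #     state = predictor.init_state(
+ #         video_path="path/to/video_or_frames",
+ #         offload_video_to_cpu=True,   # small overhead, noticeably lower GPU memory
+ #         offload_state_to_cpu=False,
+ #     )
+ #     # `state` is a plain dict; all later calls mutate it in place.
+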
+ @classmethod
+ def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2VideoPredictor":
+ """
+ Load a pretrained model from the Hugging Face hub.
+
+ Arguments:
+ model_id (str): The Hugging Face repository ID.
+ **kwargs: Additional arguments to pass to the model constructor.
+
+ Returns:
+ (SAM2VideoPredictor): The loaded model.
+ """
+ from sam2.build_sam import build_sam2_video_predictor_hf
+
+ sam_model = build_sam2_video_predictor_hf(model_id, **kwargs)
+ return sam_model
+
+ def _obj_id_to_idx(self, inference_state, obj_id):
+ """Map client-side object id to model-side object index."""
+ obj_idx = inference_state["obj_id_to_idx"].get(obj_id, None)
+ if obj_idx is not None:
+ return obj_idx
+
+ # This is a new object id not sent to the server before. We only allow adding
+ # new objects *before* the tracking starts.
+ allow_new_object = not inference_state["tracking_has_started"]
+ if allow_new_object:
+ # get the next object slot
+ obj_idx = len(inference_state["obj_id_to_idx"])
+ inference_state["obj_id_to_idx"][obj_id] = obj_idx
+ inference_state["obj_idx_to_id"][obj_idx] = obj_id
+ inference_state["obj_ids"] = list(inference_state["obj_id_to_idx"])
+ # set up input and output structures for this object
+ inference_state["point_inputs_per_obj"][obj_idx] = {}
+ inference_state["mask_inputs_per_obj"][obj_idx] = {}
+ inference_state["output_dict_per_obj"][obj_idx] = {
+ "cond_frame_outputs": {}, # dict containing {frame_idx: }
+ "non_cond_frame_outputs": {}, # dict containing {frame_idx: }
+ }
+ inference_state["temp_output_dict_per_obj"][obj_idx] = {
+ "cond_frame_outputs": {}, # dict containing {frame_idx: }
+ "non_cond_frame_outputs": {}, # dict containing {frame_idx: }
+ }
+ return obj_idx
+ else:
+ raise RuntimeError(
+ f"Cannot add new object id {obj_id} after tracking starts. "
+ f"All existing object ids: {inference_state['obj_ids']}. "
+ f"Please call 'reset_state' to restart from scratch."
+ )
+
+ def _obj_idx_to_id(self, inference_state, obj_idx):
+ """Map model-side object index to client-side object id."""
+ return inference_state["obj_idx_to_id"][obj_idx]
+
+ def _get_obj_num(self, inference_state):
+ """Get the total number of unique object ids received so far in this session."""
+ return len(inference_state["obj_idx_to_id"])
+
+ @torch.inference_mode()
+ def add_new_points_or_box(
+ self,
+ inference_state,
+ frame_idx,
+ obj_id,
+ points=None,
+ labels=None,
+ clear_old_points=True,
+ normalize_coords=True,
+ box=None,
+ ):
+ """Add new points to a frame."""
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+ point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+ mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+ if (points is not None) != (labels is not None):
+ raise ValueError("points and labels must be provided together")
+ if points is None and box is None:
+ raise ValueError("at least one of points or box must be provided as input")
+
+ if points is None:
+ points = torch.zeros(0, 2, dtype=torch.float32)
+ elif not isinstance(points, torch.Tensor):
+ points = torch.tensor(points, dtype=torch.float32)
+ if labels is None:
+ labels = torch.zeros(0, dtype=torch.int32)
+ elif not isinstance(labels, torch.Tensor):
+ labels = torch.tensor(labels, dtype=torch.int32)
+ if points.dim() == 2:
+ points = points.unsqueeze(0) # add batch dimension
+ if labels.dim() == 1:
+ labels = labels.unsqueeze(0) # add batch dimension
+
+ # If `box` is provided, we add it as the first two points with labels 2 and 3
+ # along with the user-provided points (consistent with how SAM 2 is trained).
+ if box is not None:
+ if not clear_old_points:
+ raise ValueError(
+ "cannot add box without clearing old points, since "
+ "box prompt must be provided before any point prompt "
+ "(please use clear_old_points=True instead)"
+ )
+ if inference_state["tracking_has_started"]:
+ warnings.warn(
+ "You are adding a box after tracking starts. SAM 2 may not always be "
+ "able to incorporate a box prompt for *refinement*. If you intend to "
+ "use box prompt as an *initial* input before tracking, please call "
+ "'reset_state' on the inference state to restart from scratch.",
+ category=UserWarning,
+ stacklevel=2,
+ )
+ if not isinstance(box, torch.Tensor):
+ box = torch.tensor(box, dtype=torch.float32, device=points.device)
+ box_coords = box.reshape(1, 2, 2)
+ box_labels = torch.tensor([2, 3], dtype=torch.int32, device=labels.device)
+ box_labels = box_labels.reshape(1, 2)
+ points = torch.cat([box_coords, points], dim=1)
+ labels = torch.cat([box_labels, labels], dim=1)
+
+ if normalize_coords:
+ video_H = inference_state["video_height"]
+ video_W = inference_state["video_width"]
+ points = points / torch.tensor([video_W, video_H]).to(points.device)
+ # scale the (normalized) coordinates by the model's internal image size
+ points = points * self.image_size
+ points = points.to(inference_state["device"])
+ labels = labels.to(inference_state["device"])
+
+ if not clear_old_points:
+ point_inputs = point_inputs_per_frame.get(frame_idx, None)
+ else:
+ point_inputs = None
+ point_inputs = concat_points(point_inputs, points, labels)
+
+ point_inputs_per_frame[frame_idx] = point_inputs
+ mask_inputs_per_frame.pop(frame_idx, None)
+ # If this frame hasn't been tracked before, we treat it as an initial conditioning
+ # frame, meaning that the input points are used to generate segments on this frame without
+ # using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
+ # the input points will be used to correct the already tracked masks.
+ is_init_cond_frame = frame_idx not in inference_state["frames_already_tracked"]
+ # whether to track in reverse time order
+ if is_init_cond_frame:
+ reverse = False
+ else:
+ reverse = inference_state["frames_already_tracked"][frame_idx]["reverse"]
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ # Add a frame to conditioning output if it's an initial conditioning frame or
+ # if the model sees all frames receiving clicks/mask as conditioning frames.
+ is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+
+ # Get any previously predicted mask logits on this object and feed it along with
+ # the new clicks into the SAM mask decoder.
+ prev_sam_mask_logits = None
+ # lookup temporary output dict first, which contains the most recent output
+ # (if not found, then lookup conditioning and non-conditioning frame output)
+ prev_out = obj_temp_output_dict[storage_key].get(frame_idx)
+ if prev_out is None:
+ prev_out = obj_output_dict["cond_frame_outputs"].get(frame_idx)
+ if prev_out is None:
+ prev_out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx)
+
+ if prev_out is not None and prev_out["pred_masks"] is not None:
+ device = inference_state["device"]
+ prev_sam_mask_logits = prev_out["pred_masks"].to(device, non_blocking=True)
+ # Clamp the scale of prev_sam_mask_logits to avoid rare numerical issues.
+ prev_sam_mask_logits = torch.clamp(prev_sam_mask_logits, -32.0, 32.0)
+ current_out, _ = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=obj_output_dict, # run on the slice of a single object
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ is_init_cond_frame=is_init_cond_frame,
+ point_inputs=point_inputs,
+ mask_inputs=None,
+ reverse=reverse,
+ # Skip the memory encoder when adding clicks or mask. We execute the memory encoder
+ # at the beginning of `propagate_in_video` (after the user finalizes their clicks). This
+ # allows us to enforce non-overlapping constraints on all objects before encoding
+ # them into memory.
+ run_mem_encoder=False,
+ prev_sam_mask_logits=prev_sam_mask_logits,
+ )
+ # Add the output to the output dict (to be used as future memory)
+ obj_temp_output_dict[storage_key][frame_idx] = current_out
+
+ # Resize the output mask to the original video resolution
+ obj_ids = inference_state["obj_ids"]
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ run_mem_encoder=False,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
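+ # Illustrative prompts for `add_new_points_or_box` above (not part of the library;
+ # with `normalize_coords=True`, coordinates are given in original-video pixels):
+ #
+ #     # one positive click (label 1) and one negative click (label 0) on object 1
+ #     predictor.add_new_points_or_box(state, frame_idx=0, obj_id=1,
+ #                                     points=[[420.0, 260.0], [100.0, 80.0]],
+ #                                     labels=[1, 0])
+ #     # a box prompt (x_min, y_min, x_max, y_max); it is encoded as two extra points
+ #     # with labels 2 and 3, so it must come before any refinement clicks
+ #     predictor.add_new_points_or_box(state, frame_idx=0, obj_id=2,
+ #                                     box=[300.0, 120.0, 520.0, 400.0])
+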
+ def add_new_points(self, *args, **kwargs):
+ """Deprecated method. Please use `add_new_points_or_box` instead."""
+ return self.add_new_points_or_box(*args, **kwargs)
+
+ @torch.inference_mode()
+ def add_new_mask(
+ self,
+ inference_state,
+ frame_idx,
+ obj_id,
+ mask,
+ ):
+ """Add new mask to a frame."""
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+ point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+ mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+ if not isinstance(mask, torch.Tensor):
+ mask = torch.tensor(mask, dtype=torch.bool)
+ assert mask.dim() == 2
+ mask_H, mask_W = mask.shape
+ mask_inputs_orig = mask[None, None] # add batch and channel dimension
+ mask_inputs_orig = mask_inputs_orig.float().to(inference_state["device"])
+
+ # resize the mask if it doesn't match the model's image size
+ if mask_H != self.image_size or mask_W != self.image_size:
+ mask_inputs = torch.nn.functional.interpolate(
+ mask_inputs_orig,
+ size=(self.image_size, self.image_size),
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ mask_inputs = (mask_inputs >= 0.5).float()
+ else:
+ mask_inputs = mask_inputs_orig
+
+ mask_inputs_per_frame[frame_idx] = mask_inputs
+ point_inputs_per_frame.pop(frame_idx, None)
+ # If this frame hasn't been tracked before, we treat it as an initial conditioning
+ # frame, meaning that the input mask is used to generate segments on this frame without
+ # using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
+ # the input mask will be used to correct the already tracked masks.
+ is_init_cond_frame = frame_idx not in inference_state["frames_already_tracked"]
+ # whether to track in reverse time order
+ if is_init_cond_frame:
+ reverse = False
+ else:
+ reverse = inference_state["frames_already_tracked"][frame_idx]["reverse"]
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ # Add a frame to conditioning output if it's an initial conditioning frame or
+ # if the model sees all frames receiving clicks/mask as conditioning frames.
+ is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+
+ current_out, _ = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=obj_output_dict, # run on the slice of a single object
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ is_init_cond_frame=is_init_cond_frame,
+ point_inputs=None,
+ mask_inputs=mask_inputs,
+ reverse=reverse,
+ # Skip the memory encoder when adding clicks or mask. We execute the memory encoder
+ # at the beginning of `propagate_in_video` (after the user finalizes their clicks). This
+ # allows us to enforce non-overlapping constraints on all objects before encoding
+ # them into memory.
+ run_mem_encoder=False,
+ )
+ # Add the output to the output dict (to be used as future memory)
+ obj_temp_output_dict[storage_key][frame_idx] = current_out
+
+ # Resize the output mask to the original video resolution
+ obj_ids = inference_state["obj_ids"]
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ run_mem_encoder=False,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
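+ # Illustrative use of `add_new_mask` above (not part of the library): `mask` is a
+ # 2-D boolean (or {0, 1}) array; it is resized to the model's internal image size
+ # if its resolution differs.
+ #
+ #     first_frame_mask = some_binary_segmentation          # hypothetical (H, W) bool array
+ #     predictor.add_new_mask(state, frame_idx=0, obj_id=1, mask=first_frame_mask)
+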
+ def _get_orig_video_res_output(self, inference_state, any_res_masks):
+ """
+ Resize the object scores to the original video resolution (video_res_masks)
+ and apply non-overlapping constraints for final output.
+ """
+ device = inference_state["device"]
+ video_H = inference_state["video_height"]
+ video_W = inference_state["video_width"]
+ any_res_masks = any_res_masks.to(device, non_blocking=True)
+ if any_res_masks.shape[-2:] == (video_H, video_W):
+ video_res_masks = any_res_masks
+ else:
+ video_res_masks = torch.nn.functional.interpolate(
+ any_res_masks,
+ size=(video_H, video_W),
+ mode="bilinear",
+ align_corners=False,
+ )
+ if self.non_overlap_masks:
+ video_res_masks = self._apply_non_overlapping_constraints(video_res_masks)
+ return any_res_masks, video_res_masks
+
+ def _consolidate_temp_output_across_obj(
+ self,
+ inference_state,
+ frame_idx,
+ is_cond,
+ run_mem_encoder,
+ consolidate_at_video_res=False,
+ ):
+ """
+ Consolidate the per-object temporary outputs in `temp_output_dict_per_obj` on
+ a frame into a single output for all objects, including
+ 1) fill any missing objects either from `output_dict_per_obj` (if they exist in
+ `output_dict_per_obj` for this frame) or leave them as placeholder values
+ (if they don't exist in `output_dict_per_obj` for this frame);
+ 2) if specified, rerun the memory encoder after applying non-overlapping constraints
+ on the object scores.
+ """
+ batch_size = self._get_obj_num(inference_state)
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+ # Optionally, we allow consolidating the temporary outputs at the original
+ # video resolution (to provide a better editing experience for mask prompts).
+ if consolidate_at_video_res:
+ assert not run_mem_encoder, "memory encoder cannot run at video resolution"
+ consolidated_H = inference_state["video_height"]
+ consolidated_W = inference_state["video_width"]
+ consolidated_mask_key = "pred_masks_video_res"
+ else:
+ consolidated_H = consolidated_W = self.image_size // 4
+ consolidated_mask_key = "pred_masks"
+
+ # Initialize `consolidated_out`. Its "maskmem_features" and "maskmem_pos_enc"
+ # will be added when rerunning the memory encoder after applying non-overlapping
+ # constraints to object scores. Its "pred_masks" are prefilled with a large
+ # negative value (NO_OBJ_SCORE) to represent missing objects.
+ consolidated_out = {
+ "maskmem_features": None,
+ "maskmem_pos_enc": None,
+ consolidated_mask_key: torch.full(
+ size=(batch_size, 1, consolidated_H, consolidated_W),
+ fill_value=NO_OBJ_SCORE,
+ dtype=torch.float32,
+ device=inference_state["storage_device"],
+ ),
+ "obj_ptr": torch.full(
+ size=(batch_size, self.hidden_dim),
+ fill_value=NO_OBJ_SCORE,
+ dtype=torch.float32,
+ device=inference_state["device"],
+ ),
+ "object_score_logits": torch.full(
+ size=(batch_size, 1),
+ # default to 10.0 for object_score_logits, i.e. assuming the object is
+ # present as sigmoid(10)=1, same as in `predict_masks` of `MaskDecoder`
+ fill_value=10.0,
+ dtype=torch.float32,
+ device=inference_state["device"],
+ ),
+ }
+ empty_mask_ptr = None
+ for obj_idx in range(batch_size):
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ out = obj_temp_output_dict[storage_key].get(frame_idx, None)
+ # If the object doesn't appear in "temp_output_dict_per_obj" on this frame,
+ # we fall back and look up its previous output in "output_dict_per_obj".
+ # We look up both "cond_frame_outputs" and "non_cond_frame_outputs" in
+ # "output_dict_per_obj" to find a previous output for this object.
+ if out is None:
+ out = obj_output_dict["cond_frame_outputs"].get(frame_idx, None)
+ if out is None:
+ out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx, None)
+ # If the object doesn't appear in "output_dict_per_obj" either, we skip it
+ # and leave its mask scores to the default scores (i.e. the NO_OBJ_SCORE
+ # placeholder above) and set its object pointer to be a dummy pointer.
+ if out is None:
+ # Fill in dummy object pointers for those objects without any inputs or
+ # tracking outcomes on this frame (only do it under `run_mem_encoder=True`,
+ # i.e. when we need to build the memory for tracking).
+ if run_mem_encoder:
+ if empty_mask_ptr is None:
+ empty_mask_ptr = self._get_empty_mask_ptr(
+ inference_state, frame_idx
+ )
+ # fill object pointer with a dummy pointer (based on an empty mask)
+ consolidated_out["obj_ptr"][obj_idx : obj_idx + 1] = empty_mask_ptr
+ continue
+ # Add the temporary object output mask to consolidated output mask
+ obj_mask = out["pred_masks"]
+ consolidated_pred_masks = consolidated_out[consolidated_mask_key]
+ if obj_mask.shape[-2:] == consolidated_pred_masks.shape[-2:]:
+ consolidated_pred_masks[obj_idx : obj_idx + 1] = obj_mask
+ else:
+ # Resize first if temporary object mask has a different resolution
+ resized_obj_mask = torch.nn.functional.interpolate(
+ obj_mask,
+ size=consolidated_pred_masks.shape[-2:],
+ mode="bilinear",
+ align_corners=False,
+ )
+ consolidated_pred_masks[obj_idx : obj_idx + 1] = resized_obj_mask
+ consolidated_out["obj_ptr"][obj_idx : obj_idx + 1] = out["obj_ptr"]
+ consolidated_out["object_score_logits"][obj_idx : obj_idx + 1] = out[
+ "object_score_logits"
+ ]
+
+ # Optionally, apply non-overlapping constraints on the consolidated scores
+ # and rerun the memory encoder
+ if run_mem_encoder:
+ device = inference_state["device"]
+ high_res_masks = torch.nn.functional.interpolate(
+ consolidated_out["pred_masks"].to(device, non_blocking=True),
+ size=(self.image_size, self.image_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+ if self.non_overlap_masks_for_mem_enc:
+ high_res_masks = self._apply_non_overlapping_constraints(high_res_masks)
+ maskmem_features, maskmem_pos_enc = self._run_memory_encoder(
+ inference_state=inference_state,
+ frame_idx=frame_idx,
+ batch_size=batch_size,
+ high_res_masks=high_res_masks,
+ object_score_logits=consolidated_out["object_score_logits"],
+ is_mask_from_pts=True, # these frames are what the user interacted with
+ )
+ consolidated_out["maskmem_features"] = maskmem_features
+ consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc
+
+ return consolidated_out
+
+ def _get_empty_mask_ptr(self, inference_state, frame_idx):
+ """Get a dummy object pointer based on an empty mask on the current frame."""
+ # A dummy (empty) mask with a single object
+ batch_size = 1
+ mask_inputs = torch.zeros(
+ (batch_size, 1, self.image_size, self.image_size),
+ dtype=torch.float32,
+ device=inference_state["device"],
+ )
+
+ # Retrieve correct image features
+ (
+ _,
+ _,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ ) = self._get_image_feature(inference_state, frame_idx, batch_size)
+
+ # Feed the empty mask and image feature above to get a dummy object pointer
+ current_out = self.track_step(
+ frame_idx=frame_idx,
+ is_init_cond_frame=True,
+ current_vision_feats=current_vision_feats,
+ current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes,
+ point_inputs=None,
+ mask_inputs=mask_inputs,
+ output_dict={},
+ num_frames=inference_state["num_frames"],
+ track_in_reverse=False,
+ run_mem_encoder=False,
+ prev_sam_mask_logits=None,
+ )
+ return current_out["obj_ptr"]
+
+ @torch.inference_mode()
+ def propagate_in_video_preflight(self, inference_state):
+ """Prepare inference_state and consolidate temporary outputs before tracking."""
+ # Tracking has started and we don't allow adding new objects until the session is reset.
+ inference_state["tracking_has_started"] = True
+ batch_size = self._get_obj_num(inference_state)
+
+ # Consolidate per-object temporary outputs in "temp_output_dict_per_obj" and
+ # add them into "output_dict".
+ temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+ output_dict = inference_state["output_dict"]
+ # "consolidated_frame_inds" contains indices of those frames where consolidated
+ # temporary outputs have been added (either in this call or any previous calls
+ # to `propagate_in_video_preflight`).
+ consolidated_frame_inds = inference_state["consolidated_frame_inds"]
+ for is_cond in [False, True]:
+ # Separately consolidate conditioning and non-conditioning temp outputs
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+ # Find all the frames that contain temporary outputs for any objects
+ # (these should be the frames that have just received clicks or mask inputs
+ # via `add_new_points_or_box` or `add_new_mask`)
+ temp_frame_inds = set()
+ for obj_temp_output_dict in temp_output_dict_per_obj.values():
+ temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
+ consolidated_frame_inds[storage_key].update(temp_frame_inds)
+ # consolidate the temporary output across all objects on this frame
+ for frame_idx in temp_frame_inds:
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True
+ )
+ # merge them into "output_dict" and also create per-object slices
+ output_dict[storage_key][frame_idx] = consolidated_out
+ self._add_output_per_object(
+ inference_state, frame_idx, consolidated_out, storage_key
+ )
+ clear_non_cond_mem = self.clear_non_cond_mem_around_input and (
+ self.clear_non_cond_mem_for_multi_obj or batch_size <= 1
+ )
+ if clear_non_cond_mem:
+ # clear non-conditioning memory of the surrounding frames
+ self._clear_non_cond_mem_around_input(inference_state, frame_idx)
+
+ # clear temporary outputs in `temp_output_dict_per_obj`
+ for obj_temp_output_dict in temp_output_dict_per_obj.values():
+ obj_temp_output_dict[storage_key].clear()
+
+ # edge case: if an output is added to "cond_frame_outputs", we remove any prior
+ # output on the same frame in "non_cond_frame_outputs"
+ for frame_idx in output_dict["cond_frame_outputs"]:
+ output_dict["non_cond_frame_outputs"].pop(frame_idx, None)
+ for obj_output_dict in inference_state["output_dict_per_obj"].values():
+ for frame_idx in obj_output_dict["cond_frame_outputs"]:
+ obj_output_dict["non_cond_frame_outputs"].pop(frame_idx, None)
+ for frame_idx in consolidated_frame_inds["cond_frame_outputs"]:
+ assert frame_idx in output_dict["cond_frame_outputs"]
+ consolidated_frame_inds["non_cond_frame_outputs"].discard(frame_idx)
+
+ # Make sure that the frame indices in "consolidated_frame_inds" are exactly those frames
+ # with either points or mask inputs (which should be true under a correct workflow).
+ all_consolidated_frame_inds = (
+ consolidated_frame_inds["cond_frame_outputs"]
+ | consolidated_frame_inds["non_cond_frame_outputs"]
+ )
+ input_frames_inds = set()
+ for point_inputs_per_frame in inference_state["point_inputs_per_obj"].values():
+ input_frames_inds.update(point_inputs_per_frame.keys())
+ for mask_inputs_per_frame in inference_state["mask_inputs_per_obj"].values():
+ input_frames_inds.update(mask_inputs_per_frame.keys())
+ assert all_consolidated_frame_inds == input_frames_inds
+
+ @torch.inference_mode()
+ def propagate_in_video(
+ self,
+ inference_state,
+ start_frame_idx=None,
+ max_frame_num_to_track=None,
+ reverse=False,
+ ):
+ """Propagate the input points across frames to track in the entire video."""
+ self.propagate_in_video_preflight(inference_state)
+
+ output_dict = inference_state["output_dict"]
+ consolidated_frame_inds = inference_state["consolidated_frame_inds"]
+ obj_ids = inference_state["obj_ids"]
+ num_frames = inference_state["num_frames"]
+ batch_size = self._get_obj_num(inference_state)
+ if len(output_dict["cond_frame_outputs"]) == 0:
+ raise RuntimeError("No points are provided; please add points first")
+ clear_non_cond_mem = self.clear_non_cond_mem_around_input and (
+ self.clear_non_cond_mem_for_multi_obj or batch_size <= 1
+ )
+
+ # set start index, end index, and processing order
+ if start_frame_idx is None:
+ # default: start from the earliest frame with input points
+ start_frame_idx = min(output_dict["cond_frame_outputs"])
+ if max_frame_num_to_track is None:
+ # default: track all the frames in the video
+ max_frame_num_to_track = num_frames
+ if reverse:
+ end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0)
+ if start_frame_idx > 0:
+ processing_order = range(start_frame_idx, end_frame_idx - 1, -1)
+ else:
+ processing_order = [] # skip reverse tracking if starting from frame 0
+ else:
+ end_frame_idx = min(
+ start_frame_idx + max_frame_num_to_track, num_frames - 1
+ )
+ processing_order = range(start_frame_idx, end_frame_idx + 1)
+
+ for frame_idx in tqdm(processing_order, desc="propagate in video"):
+ # We skip those frames already in consolidated outputs (these are frames
+ # that received input clicks or mask). Note that we cannot directly run
+ # batched forward on them via `_run_single_frame_inference` because the
+ # number of clicks on each object might be different.
+ if frame_idx in consolidated_frame_inds["cond_frame_outputs"]:
+ storage_key = "cond_frame_outputs"
+ current_out = output_dict[storage_key][frame_idx]
+ pred_masks = current_out["pred_masks"]
+ if clear_non_cond_mem:
+ # clear non-conditioning memory of the surrounding frames
+ self._clear_non_cond_mem_around_input(inference_state, frame_idx)
+ elif frame_idx in consolidated_frame_inds["non_cond_frame_outputs"]:
+ storage_key = "non_cond_frame_outputs"
+ current_out = output_dict[storage_key][frame_idx]
+ pred_masks = current_out["pred_masks"]
+ else:
+ storage_key = "non_cond_frame_outputs"
+ current_out, pred_masks = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=output_dict,
+ frame_idx=frame_idx,
+ batch_size=batch_size,
+ is_init_cond_frame=False,
+ point_inputs=None,
+ mask_inputs=None,
+ reverse=reverse,
+ run_mem_encoder=True,
+ )
+ output_dict[storage_key][frame_idx] = current_out
+ # Create slices of per-object outputs for subsequent interaction with each
+ # individual object after tracking.
+ self._add_output_per_object(
+ inference_state, frame_idx, current_out, storage_key
+ )
+ inference_state["frames_already_tracked"][frame_idx] = {"reverse": reverse}
+
+ # Resize the output mask to the original video resolution (we directly use
+ # the mask scores on GPU for output to avoid any CPU conversion in between)
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, pred_masks
+ )
+ yield frame_idx, obj_ids, video_res_masks
+
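+ # `propagate_in_video` above is a generator; a common consumption pattern
+ # (illustrative only) collects per-frame, per-object binary masks:
+ #
+ #     video_segments = {}
+ #     for frame_idx, obj_ids, video_res_masks in predictor.propagate_in_video(state):
+ #         video_segments[frame_idx] = {
+ #             obj_id: (video_res_masks[i] > 0.0).cpu().numpy()
+ #             for i, obj_id in enumerate(obj_ids)
+ #         }
+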
+ def _add_output_per_object(
+ self, inference_state, frame_idx, current_out, storage_key
+ ):
+ """
+ Split a multi-object output into per-object output slices and add them into
+ `output_dict_per_obj`. The resulting slices share the same tensor storage.
+ """
+ maskmem_features = current_out["maskmem_features"]
+ assert maskmem_features is None or isinstance(maskmem_features, torch.Tensor)
+
+ maskmem_pos_enc = current_out["maskmem_pos_enc"]
+ assert maskmem_pos_enc is None or isinstance(maskmem_pos_enc, list)
+
+ output_dict_per_obj = inference_state["output_dict_per_obj"]
+ for obj_idx, obj_output_dict in output_dict_per_obj.items():
+ obj_slice = slice(obj_idx, obj_idx + 1)
+ obj_out = {
+ "maskmem_features": None,
+ "maskmem_pos_enc": None,
+ "pred_masks": current_out["pred_masks"][obj_slice],
+ "obj_ptr": current_out["obj_ptr"][obj_slice],
+ "object_score_logits": current_out["object_score_logits"][obj_slice],
+ }
+ if maskmem_features is not None:
+ obj_out["maskmem_features"] = maskmem_features[obj_slice]
+ if maskmem_pos_enc is not None:
+ obj_out["maskmem_pos_enc"] = [x[obj_slice] for x in maskmem_pos_enc]
+ obj_output_dict[storage_key][frame_idx] = obj_out
+
+ @torch.inference_mode()
+ def clear_all_prompts_in_frame(
+ self, inference_state, frame_idx, obj_id, need_output=True
+ ):
+ """Remove all input points or mask in a specific frame for a given object."""
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+
+ # Clear the conditioning information on the given frame
+ inference_state["point_inputs_per_obj"][obj_idx].pop(frame_idx, None)
+ inference_state["mask_inputs_per_obj"][obj_idx].pop(frame_idx, None)
+
+ temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+ temp_output_dict_per_obj[obj_idx]["cond_frame_outputs"].pop(frame_idx, None)
+ temp_output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].pop(frame_idx, None)
+
+ # Check and see if there are still any inputs left on this frame
+ batch_size = self._get_obj_num(inference_state)
+ frame_has_input = False
+ for obj_idx2 in range(batch_size):
+ if frame_idx in inference_state["point_inputs_per_obj"][obj_idx2]:
+ frame_has_input = True
+ break
+ if frame_idx in inference_state["mask_inputs_per_obj"][obj_idx2]:
+ frame_has_input = True
+ break
+
+ # If this frame has no remaining inputs for any objects, we further clear its
+ # conditioning frame status
+ if not frame_has_input:
+ output_dict = inference_state["output_dict"]
+ consolidated_frame_inds = inference_state["consolidated_frame_inds"]
+ consolidated_frame_inds["cond_frame_outputs"].discard(frame_idx)
+ consolidated_frame_inds["non_cond_frame_outputs"].discard(frame_idx)
+ # Remove the frame's conditioning output (possibly downgrading it to non-conditioning)
+ out = output_dict["cond_frame_outputs"].pop(frame_idx, None)
+ if out is not None:
+ # The frame is not a conditioning frame anymore since it's not receiving inputs,
+ # so we "downgrade" its output (if it exists) to a non-conditioning frame output.
+ output_dict["non_cond_frame_outputs"][frame_idx] = out
+ inference_state["frames_already_tracked"].pop(frame_idx, None)
+ # Similarly, do it for the sliced output on each object.
+ for obj_idx2 in range(batch_size):
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx2]
+ obj_out = obj_output_dict["cond_frame_outputs"].pop(frame_idx, None)
+ if obj_out is not None:
+ obj_output_dict["non_cond_frame_outputs"][frame_idx] = obj_out
+
+ # If all the conditioning frames have been removed, we also clear the tracking outputs
+ if len(output_dict["cond_frame_outputs"]) == 0:
+ self._reset_tracking_results(inference_state)
+
+ if not need_output:
+ return
+ # Finally, output updated masks per object (after removing the inputs above)
+ obj_ids = inference_state["obj_ids"]
+ is_cond = any(
+ frame_idx in obj_temp_output_dict["cond_frame_outputs"]
+ for obj_temp_output_dict in temp_output_dict_per_obj.values()
+ )
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ run_mem_encoder=False,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
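+ # Illustrative use of `clear_all_prompts_in_frame` above (not part of the library):
+ # drop every click/mask placed on one frame for one object, then re-prompt it.
+ #
+ #     predictor.clear_all_prompts_in_frame(state, frame_idx=0, obj_id=1)
+ #     predictor.add_new_points_or_box(state, frame_idx=0, obj_id=1,
+ #                                     points=[[200.0, 200.0]], labels=[1])
+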
+ @torch.inference_mode()
+ def reset_state(self, inference_state):
+ """Remove all input points or mask in all frames throughout the video."""
+ self._reset_tracking_results(inference_state)
+ # Remove all object ids
+ inference_state["obj_id_to_idx"].clear()
+ inference_state["obj_idx_to_id"].clear()
+ inference_state["obj_ids"].clear()
+ inference_state["point_inputs_per_obj"].clear()
+ inference_state["mask_inputs_per_obj"].clear()
+ inference_state["output_dict_per_obj"].clear()
+ inference_state["temp_output_dict_per_obj"].clear()
+
+ def _reset_tracking_results(self, inference_state):
+ """Reset all tracking inputs and results across the videos."""
+ for v in inference_state["point_inputs_per_obj"].values():
+ v.clear()
+ for v in inference_state["mask_inputs_per_obj"].values():
+ v.clear()
+ for v in inference_state["output_dict_per_obj"].values():
+ v["cond_frame_outputs"].clear()
+ v["non_cond_frame_outputs"].clear()
+ for v in inference_state["temp_output_dict_per_obj"].values():
+ v["cond_frame_outputs"].clear()
+ v["non_cond_frame_outputs"].clear()
+ inference_state["output_dict"]["cond_frame_outputs"].clear()
+ inference_state["output_dict"]["non_cond_frame_outputs"].clear()
+ inference_state["consolidated_frame_inds"]["cond_frame_outputs"].clear()
+ inference_state["consolidated_frame_inds"]["non_cond_frame_outputs"].clear()
+ inference_state["tracking_has_started"] = False
+ inference_state["frames_already_tracked"].clear()
+
+ def _get_image_feature(self, inference_state, frame_idx, batch_size):
+ """Compute the image features on a given frame."""
+ # Look up in the cache first
+ image, backbone_out = inference_state["cached_features"].get(
+ frame_idx, (None, None)
+ )
+ if backbone_out is None:
+ # Cache miss -- we will run inference on a single image
+ device = inference_state["device"]
+ image = inference_state["images"][frame_idx].to(device).float().unsqueeze(0)
+ backbone_out = self.forward_image(image)
+ # Cache the most recent frame's feature (for repeated interactions with
+ # a frame; we can use an LRU cache for more frames in the future).
+ inference_state["cached_features"] = {frame_idx: (image, backbone_out)}
+
+ # expand the features to have the same dimension as the number of objects
+ expanded_image = image.expand(batch_size, -1, -1, -1)
+ expanded_backbone_out = {
+ "backbone_fpn": backbone_out["backbone_fpn"].copy(),
+ "vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
+ }
+ for i, feat in enumerate(expanded_backbone_out["backbone_fpn"]):
+ expanded_backbone_out["backbone_fpn"][i] = feat.expand(
+ batch_size, -1, -1, -1
+ )
+ for i, pos in enumerate(expanded_backbone_out["vision_pos_enc"]):
+ pos = pos.expand(batch_size, -1, -1, -1)
+ expanded_backbone_out["vision_pos_enc"][i] = pos
+
+ features = self._prepare_backbone_features(expanded_backbone_out)
+ features = (expanded_image,) + features
+ return features
+
+ def _run_single_frame_inference(
+ self,
+ inference_state,
+ output_dict,
+ frame_idx,
+ batch_size,
+ is_init_cond_frame,
+ point_inputs,
+ mask_inputs,
+ reverse,
+ run_mem_encoder,
+ prev_sam_mask_logits=None,
+ ):
+ """Run tracking on a single frame based on current inputs and previous memory."""
+ # Retrieve correct image features
+ (
+ _,
+ _,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ ) = self._get_image_feature(inference_state, frame_idx, batch_size)
+
+ # point and mask should not appear as input simultaneously on the same frame
+ assert point_inputs is None or mask_inputs is None
+ current_out = self.track_step(
+ frame_idx=frame_idx,
+ is_init_cond_frame=is_init_cond_frame,
+ current_vision_feats=current_vision_feats,
+ current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes,
+ point_inputs=point_inputs,
+ mask_inputs=mask_inputs,
+ output_dict=output_dict,
+ num_frames=inference_state["num_frames"],
+ track_in_reverse=reverse,
+ run_mem_encoder=run_mem_encoder,
+ prev_sam_mask_logits=prev_sam_mask_logits,
+ )
+
+ # optionally offload the output to CPU memory to save GPU space
+ storage_device = inference_state["storage_device"]
+ maskmem_features = current_out["maskmem_features"]
+ if maskmem_features is not None:
+ maskmem_features = maskmem_features.to(torch.bfloat16)
+ maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
+ pred_masks_gpu = current_out["pred_masks"]
+ # potentially fill holes in the predicted masks
+ if self.fill_hole_area > 0:
+ pred_masks_gpu = fill_holes_in_mask_scores(
+ pred_masks_gpu, self.fill_hole_area
+ )
+ pred_masks = pred_masks_gpu.to(storage_device, non_blocking=True)
+ # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+ maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, current_out)
+ # object pointer is a small tensor, so we always keep it on GPU memory for fast access
+ obj_ptr = current_out["obj_ptr"]
+ object_score_logits = current_out["object_score_logits"]
+ # make a compact version of this frame's output to reduce the state size
+ compact_current_out = {
+ "maskmem_features": maskmem_features,
+ "maskmem_pos_enc": maskmem_pos_enc,
+ "pred_masks": pred_masks,
+ "obj_ptr": obj_ptr,
+ "object_score_logits": object_score_logits,
+ }
+ return compact_current_out, pred_masks_gpu
+
+ def _run_memory_encoder(
+ self,
+ inference_state,
+ frame_idx,
+ batch_size,
+ high_res_masks,
+ object_score_logits,
+ is_mask_from_pts,
+ ):
+ """
+        Run the memory encoder on `high_res_masks`. This is usually done after applying
+        non-overlapping constraints to the object scores. Since the scores have changed,
+        their memory also needs to be recomputed with the memory encoder.
+ """
+ # Retrieve correct image features
+ _, _, current_vision_feats, _, feat_sizes = self._get_image_feature(
+ inference_state, frame_idx, batch_size
+ )
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+ current_vision_feats=current_vision_feats,
+ feat_sizes=feat_sizes,
+ pred_masks_high_res=high_res_masks,
+ object_score_logits=object_score_logits,
+ is_mask_from_pts=is_mask_from_pts,
+ )
+
+ # optionally offload the output to CPU memory to save GPU space
+ storage_device = inference_state["storage_device"]
+ maskmem_features = maskmem_features.to(torch.bfloat16)
+ maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
+ # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+ maskmem_pos_enc = self._get_maskmem_pos_enc(
+ inference_state, {"maskmem_pos_enc": maskmem_pos_enc}
+ )
+ return maskmem_features, maskmem_pos_enc
+
+ def _get_maskmem_pos_enc(self, inference_state, current_out):
+ """
+ `maskmem_pos_enc` is the same across frames and objects, so we cache it as
+ a constant in the inference session to reduce session storage size.
+ """
+ model_constants = inference_state["constants"]
+ # "out_maskmem_pos_enc" should be either a list of tensors or None
+ out_maskmem_pos_enc = current_out["maskmem_pos_enc"]
+ if out_maskmem_pos_enc is not None:
+ if "maskmem_pos_enc" not in model_constants:
+ assert isinstance(out_maskmem_pos_enc, list)
+ # only take the slice for one object, since it's same across objects
+ maskmem_pos_enc = [x[0:1].clone() for x in out_maskmem_pos_enc]
+ model_constants["maskmem_pos_enc"] = maskmem_pos_enc
+ else:
+ maskmem_pos_enc = model_constants["maskmem_pos_enc"]
+ # expand the cached maskmem_pos_enc to the actual batch size
+ batch_size = out_maskmem_pos_enc[0].size(0)
+ expanded_maskmem_pos_enc = [
+ x.expand(batch_size, -1, -1, -1) for x in maskmem_pos_enc
+ ]
+ else:
+ expanded_maskmem_pos_enc = None
+ return expanded_maskmem_pos_enc
+
+ @torch.inference_mode()
+ def remove_object(self, inference_state, obj_id, strict=False, need_output=True):
+ """
+ Remove an object id from the tracking state. If strict is True, we check whether
+ the object id actually exists and raise an error if it doesn't exist.
+ """
+ old_obj_idx_to_rm = inference_state["obj_id_to_idx"].get(obj_id, None)
+ updated_frames = []
+ # Check whether this object_id to remove actually exists and possibly raise an error.
+ if old_obj_idx_to_rm is None:
+ if not strict:
+ return inference_state["obj_ids"], updated_frames
+ raise RuntimeError(
+ f"Cannot remove object id {obj_id} as it doesn't exist. "
+ f"All existing object ids: {inference_state['obj_ids']}."
+ )
+
+ # If this is the only remaining object id, we simply reset the state.
+ if len(inference_state["obj_id_to_idx"]) == 1:
+ self.reset_state(inference_state)
+ return inference_state["obj_ids"], updated_frames
+
+ # There are still remaining objects after removing this object id. In this case,
+ # we need to delete the object storage from inference state tensors.
+ # Step 0: clear the input on those frames where this object id has point or mask input
+ # (note that this step is required as it might downgrade conditioning frames to
+ # non-conditioning ones)
+ obj_input_frames_inds = set()
+ obj_input_frames_inds.update(
+ inference_state["point_inputs_per_obj"][old_obj_idx_to_rm]
+ )
+ obj_input_frames_inds.update(
+ inference_state["mask_inputs_per_obj"][old_obj_idx_to_rm]
+ )
+ for frame_idx in obj_input_frames_inds:
+ self.clear_all_prompts_in_frame(
+ inference_state, frame_idx, obj_id, need_output=False
+ )
+
+ # Step 1: Update the object id mapping (note that it must be done after Step 0,
+ # since Step 0 still requires the old object id mappings in inference_state)
+ old_obj_ids = inference_state["obj_ids"]
+ old_obj_inds = list(range(len(old_obj_ids)))
+ remain_old_obj_inds = old_obj_inds.copy()
+ remain_old_obj_inds.remove(old_obj_idx_to_rm)
+ new_obj_ids = [old_obj_ids[old_idx] for old_idx in remain_old_obj_inds]
+ new_obj_inds = list(range(len(new_obj_ids)))
+ # build new mappings
+ old_idx_to_new_idx = dict(zip(remain_old_obj_inds, new_obj_inds))
+ inference_state["obj_id_to_idx"] = dict(zip(new_obj_ids, new_obj_inds))
+ inference_state["obj_idx_to_id"] = dict(zip(new_obj_inds, new_obj_ids))
+ inference_state["obj_ids"] = new_obj_ids
+
+ # Step 2: For per-object tensor storage, we shift their obj_idx in the dict keys.
+ # (note that "consolidated_frame_inds" doesn't need to be updated in this step as
+ # it's already handled in Step 0)
+ def _map_keys(container):
+ new_kvs = []
+ for k in old_obj_inds:
+ v = container.pop(k)
+ if k in old_idx_to_new_idx:
+ new_kvs.append((old_idx_to_new_idx[k], v))
+ container.update(new_kvs)
+
+ _map_keys(inference_state["point_inputs_per_obj"])
+ _map_keys(inference_state["mask_inputs_per_obj"])
+ _map_keys(inference_state["output_dict_per_obj"])
+ _map_keys(inference_state["temp_output_dict_per_obj"])
+
+ # Step 3: For packed tensor storage, we index the remaining ids and rebuild the per-object slices.
+ def _slice_state(output_dict, storage_key):
+ for frame_idx, out in output_dict[storage_key].items():
+ out["maskmem_features"] = out["maskmem_features"][remain_old_obj_inds]
+ out["maskmem_pos_enc"] = [
+ x[remain_old_obj_inds] for x in out["maskmem_pos_enc"]
+ ]
+ # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+ out["maskmem_pos_enc"] = self._get_maskmem_pos_enc(inference_state, out)
+ out["pred_masks"] = out["pred_masks"][remain_old_obj_inds]
+ out["obj_ptr"] = out["obj_ptr"][remain_old_obj_inds]
+ out["object_score_logits"] = out["object_score_logits"][
+ remain_old_obj_inds
+ ]
+ # also update the per-object slices
+ self._add_output_per_object(
+ inference_state, frame_idx, out, storage_key
+ )
+
+ _slice_state(inference_state["output_dict"], "cond_frame_outputs")
+ _slice_state(inference_state["output_dict"], "non_cond_frame_outputs")
+
+ # Step 4: Further collect the outputs on those frames in `obj_input_frames_inds`, which
+ # could show an updated mask for objects previously occluded by the object being removed
+ if need_output:
+ temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+ for frame_idx in obj_input_frames_inds:
+ is_cond = any(
+ frame_idx in obj_temp_output_dict["cond_frame_outputs"]
+ for obj_temp_output_dict in temp_output_dict_per_obj.values()
+ )
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ run_mem_encoder=False,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ updated_frames.append((frame_idx, video_res_masks))
+
+ return inference_state["obj_ids"], updated_frames
+
+ def _clear_non_cond_mem_around_input(self, inference_state, frame_idx):
+ """
+ Remove the non-conditioning memory around the input frame. When users provide
+ correction clicks, the surrounding frames' non-conditioning memories can still
+ contain outdated object appearance information and could confuse the model.
+
+ This method clears those non-conditioning memories surrounding the interacted
+ frame to avoid giving the model both old and new information about the object.
+ """
+ r = self.memory_temporal_stride_for_eval
+ frame_idx_begin = frame_idx - r * self.num_maskmem
+ frame_idx_end = frame_idx + r * self.num_maskmem
+ output_dict = inference_state["output_dict"]
+ non_cond_frame_outputs = output_dict["non_cond_frame_outputs"]
+ for t in range(frame_idx_begin, frame_idx_end + 1):
+ non_cond_frame_outputs.pop(t, None)
+ for obj_output_dict in inference_state["output_dict_per_obj"].values():
+ obj_output_dict["non_cond_frame_outputs"].pop(t, None)
diff --git a/phantom/submodules/sam2/sam2/utils/__init__.py b/phantom/submodules/sam2/sam2/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/utils/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/sam2/utils/amg.py b/phantom/submodules/sam2/sam2/utils/amg.py
new file mode 100644
index 0000000000000000000000000000000000000000..986842960cf5deca00614b7b1cde1ab77dad7e6e
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/utils/amg.py
@@ -0,0 +1,348 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from copy import deepcopy
+from itertools import product
+from typing import Any, Dict, Generator, ItemsView, List, Tuple
+
+import numpy as np
+import torch
+
+# Very lightly adapted from https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/utils/amg.py
+
+
+class MaskData:
+ """
+ A structure for storing masks and their related data in batched format.
+ Implements basic filtering and concatenation.
+ """
+
+ def __init__(self, **kwargs) -> None:
+ for v in kwargs.values():
+ assert isinstance(
+ v, (list, np.ndarray, torch.Tensor)
+ ), "MaskData only supports list, numpy arrays, and torch tensors."
+ self._stats = dict(**kwargs)
+
+ def __setitem__(self, key: str, item: Any) -> None:
+ assert isinstance(
+ item, (list, np.ndarray, torch.Tensor)
+ ), "MaskData only supports list, numpy arrays, and torch tensors."
+ self._stats[key] = item
+
+ def __delitem__(self, key: str) -> None:
+ del self._stats[key]
+
+ def __getitem__(self, key: str) -> Any:
+ return self._stats[key]
+
+ def items(self) -> ItemsView[str, Any]:
+ return self._stats.items()
+
+ def filter(self, keep: torch.Tensor) -> None:
+ for k, v in self._stats.items():
+ if v is None:
+ self._stats[k] = None
+ elif isinstance(v, torch.Tensor):
+ self._stats[k] = v[torch.as_tensor(keep, device=v.device)]
+ elif isinstance(v, np.ndarray):
+ self._stats[k] = v[keep.detach().cpu().numpy()]
+ elif isinstance(v, list) and keep.dtype == torch.bool:
+ self._stats[k] = [a for i, a in enumerate(v) if keep[i]]
+ elif isinstance(v, list):
+ self._stats[k] = [v[i] for i in keep]
+ else:
+ raise TypeError(f"MaskData key {k} has an unsupported type {type(v)}.")
+
+ def cat(self, new_stats: "MaskData") -> None:
+ for k, v in new_stats.items():
+ if k not in self._stats or self._stats[k] is None:
+ self._stats[k] = deepcopy(v)
+ elif isinstance(v, torch.Tensor):
+ self._stats[k] = torch.cat([self._stats[k], v], dim=0)
+ elif isinstance(v, np.ndarray):
+ self._stats[k] = np.concatenate([self._stats[k], v], axis=0)
+ elif isinstance(v, list):
+ self._stats[k] = self._stats[k] + deepcopy(v)
+ else:
+ raise TypeError(f"MaskData key {k} has an unsupported type {type(v)}.")
+
+ def to_numpy(self) -> None:
+ for k, v in self._stats.items():
+ if isinstance(v, torch.Tensor):
+ self._stats[k] = v.float().detach().cpu().numpy()
+
+
+def is_box_near_crop_edge(
+ boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
+) -> torch.Tensor:
+ """Filter masks at the edge of a crop, but not at the edge of the original image."""
+ crop_box_torch = torch.as_tensor(crop_box, dtype=torch.float, device=boxes.device)
+ orig_box_torch = torch.as_tensor(orig_box, dtype=torch.float, device=boxes.device)
+ boxes = uncrop_boxes_xyxy(boxes, crop_box).float()
+ near_crop_edge = torch.isclose(boxes, crop_box_torch[None, :], atol=atol, rtol=0)
+ near_image_edge = torch.isclose(boxes, orig_box_torch[None, :], atol=atol, rtol=0)
+ near_crop_edge = torch.logical_and(near_crop_edge, ~near_image_edge)
+ return torch.any(near_crop_edge, dim=1)
+
+
+def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor:
+ box_xywh = deepcopy(box_xyxy)
+ box_xywh[2] = box_xywh[2] - box_xywh[0]
+ box_xywh[3] = box_xywh[3] - box_xywh[1]
+ return box_xywh
+
+
+def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
+ assert len(args) > 0 and all(
+ len(a) == len(args[0]) for a in args
+ ), "Batched iteration must have inputs of all the same size."
+ n_batches = len(args[0]) // batch_size + int(len(args[0]) % batch_size != 0)
+ for b in range(n_batches):
+ yield [arg[b * batch_size : (b + 1) * batch_size] for arg in args]
+
+
+def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]:
+ """
+ Encodes masks to an uncompressed RLE, in the format expected by
+ pycoco tools.
+ """
+ # Put in fortran order and flatten h,w
+ b, h, w = tensor.shape
+ tensor = tensor.permute(0, 2, 1).flatten(1)
+
+ # Compute change indices
+ diff = tensor[:, 1:] ^ tensor[:, :-1]
+ change_indices = diff.nonzero()
+
+ # Encode run length
+ out = []
+ for i in range(b):
+ cur_idxs = change_indices[change_indices[:, 0] == i, 1]
+ cur_idxs = torch.cat(
+ [
+ torch.tensor([0], dtype=cur_idxs.dtype, device=cur_idxs.device),
+ cur_idxs + 1,
+ torch.tensor([h * w], dtype=cur_idxs.dtype, device=cur_idxs.device),
+ ]
+ )
+ btw_idxs = cur_idxs[1:] - cur_idxs[:-1]
+ counts = [] if tensor[i, 0] == 0 else [0]
+ counts.extend(btw_idxs.detach().cpu().tolist())
+ out.append({"size": [h, w], "counts": counts})
+ return out
+
+
+def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray:
+ """Compute a binary mask from an uncompressed RLE."""
+ h, w = rle["size"]
+ mask = np.empty(h * w, dtype=bool)
+ idx = 0
+ parity = False
+ for count in rle["counts"]:
+ mask[idx : idx + count] = parity
+ idx += count
+ parity ^= True
+ mask = mask.reshape(w, h)
+ return mask.transpose() # Put in C order
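+
+
+# Illustrative round-trip sketch for the two RLE helpers above (not used anywhere in
+# this file; the mask shape and values are made up for the example):
+#   masks = torch.zeros(1, 4, 4, dtype=torch.bool); masks[0, 1:3, 1:3] = True
+#   rles = mask_to_rle_pytorch(masks)      # one uncompressed RLE dict per mask
+#   recovered = rle_to_mask(rles[0])       # (4, 4) numpy bool array
+#   assert (recovered == masks[0].numpy()).all()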
+
+
+def area_from_rle(rle: Dict[str, Any]) -> int:
+ return sum(rle["counts"][1::2])
+
+
+def calculate_stability_score(
+ masks: torch.Tensor, mask_threshold: float, threshold_offset: float
+) -> torch.Tensor:
+ """
+ Computes the stability score for a batch of masks. The stability
+ score is the IoU between the binary masks obtained by thresholding
+ the predicted mask logits at high and low values.
+ """
+ # One mask is always contained inside the other.
+ # Save memory by preventing unnecessary cast to torch.int64
+ intersections = (
+ (masks > (mask_threshold + threshold_offset))
+ .sum(-1, dtype=torch.int16)
+ .sum(-1, dtype=torch.int32)
+ )
+ unions = (
+ (masks > (mask_threshold - threshold_offset))
+ .sum(-1, dtype=torch.int16)
+ .sum(-1, dtype=torch.int32)
+ )
+ return intersections / unions
+
+
+def build_point_grid(n_per_side: int) -> np.ndarray:
+ """Generates a 2D grid of points evenly spaced in [0,1]x[0,1]."""
+ offset = 1 / (2 * n_per_side)
+ points_one_side = np.linspace(offset, 1 - offset, n_per_side)
+ points_x = np.tile(points_one_side[None, :], (n_per_side, 1))
+ points_y = np.tile(points_one_side[:, None], (1, n_per_side))
+ points = np.stack([points_x, points_y], axis=-1).reshape(-1, 2)
+ return points
+
+
+def build_all_layer_point_grids(
+ n_per_side: int, n_layers: int, scale_per_layer: int
+) -> List[np.ndarray]:
+ """Generates point grids for all crop layers."""
+ points_by_layer = []
+ for i in range(n_layers + 1):
+ n_points = int(n_per_side / (scale_per_layer**i))
+ points_by_layer.append(build_point_grid(n_points))
+ return points_by_layer
+
+
+def generate_crop_boxes(
+ im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
+) -> Tuple[List[List[int]], List[int]]:
+ """
+ Generates a list of crop boxes of different sizes. Each layer
+ has (2**i)**2 boxes for the ith layer.
+ """
+ crop_boxes, layer_idxs = [], []
+ im_h, im_w = im_size
+ short_side = min(im_h, im_w)
+
+ # Original image
+ crop_boxes.append([0, 0, im_w, im_h])
+ layer_idxs.append(0)
+
+ def crop_len(orig_len, n_crops, overlap):
+ return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops))
+
+ for i_layer in range(n_layers):
+ n_crops_per_side = 2 ** (i_layer + 1)
+ overlap = int(overlap_ratio * short_side * (2 / n_crops_per_side))
+
+ crop_w = crop_len(im_w, n_crops_per_side, overlap)
+ crop_h = crop_len(im_h, n_crops_per_side, overlap)
+
+ crop_box_x0 = [int((crop_w - overlap) * i) for i in range(n_crops_per_side)]
+ crop_box_y0 = [int((crop_h - overlap) * i) for i in range(n_crops_per_side)]
+
+        # Crops in XYXY format
+ for x0, y0 in product(crop_box_x0, crop_box_y0):
+ box = [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)]
+ crop_boxes.append(box)
+ layer_idxs.append(i_layer + 1)
+
+ return crop_boxes, layer_idxs
+
+
+def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ x0, y0, _, _ = crop_box
+ offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device)
+ # Check if boxes has a channel dimension
+ if len(boxes.shape) == 3:
+ offset = offset.unsqueeze(1)
+ return boxes + offset
+
+
+def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ x0, y0, _, _ = crop_box
+ offset = torch.tensor([[x0, y0]], device=points.device)
+ # Check if points has a channel dimension
+ if len(points.shape) == 3:
+ offset = offset.unsqueeze(1)
+ return points + offset
+
+
+def uncrop_masks(
+ masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int
+) -> torch.Tensor:
+ x0, y0, x1, y1 = crop_box
+ if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h:
+ return masks
+ # Coordinate transform masks
+ pad_x, pad_y = orig_w - (x1 - x0), orig_h - (y1 - y0)
+ pad = (x0, pad_x - x0, y0, pad_y - y0)
+ return torch.nn.functional.pad(masks, pad, value=0)
+
+
+def remove_small_regions(
+ mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+ """
+    Removes small disconnected regions and holes in a mask. Returns the
+    mask and an indicator of whether the mask has been modified.
+ """
+ import cv2 # type: ignore
+
+ assert mode in ["holes", "islands"]
+ correct_holes = mode == "holes"
+ working_mask = (correct_holes ^ mask).astype(np.uint8)
+ n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+ sizes = stats[:, -1][1:] # Row 0 is background label
+ small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+ if len(small_regions) == 0:
+ return mask, False
+ fill_labels = [0] + small_regions
+ if not correct_holes:
+ fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+ # If every region is below threshold, keep largest
+ if len(fill_labels) == 0:
+ fill_labels = [int(np.argmax(sizes)) + 1]
+ mask = np.isin(regions, fill_labels)
+ return mask, True
+
+
+def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]:
+ from pycocotools import mask as mask_utils # type: ignore
+
+ h, w = uncompressed_rle["size"]
+ rle = mask_utils.frPyObjects(uncompressed_rle, h, w)
+ rle["counts"] = rle["counts"].decode("utf-8") # Necessary to serialize with json
+ return rle
+
+
+def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
+ """
+ Calculates boxes in XYXY format around masks. Return [0,0,0,0] for
+ an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4.
+ """
+ # torch.max below raises an error on empty inputs, just skip in this case
+ if torch.numel(masks) == 0:
+ return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
+
+ # Normalize shape to CxHxW
+ shape = masks.shape
+ h, w = shape[-2:]
+ if len(shape) > 2:
+ masks = masks.flatten(0, -3)
+ else:
+ masks = masks.unsqueeze(0)
+
+ # Get top and bottom edges
+ in_height, _ = torch.max(masks, dim=-1)
+ in_height_coords = in_height * torch.arange(h, device=in_height.device)[None, :]
+ bottom_edges, _ = torch.max(in_height_coords, dim=-1)
+ in_height_coords = in_height_coords + h * (~in_height)
+ top_edges, _ = torch.min(in_height_coords, dim=-1)
+
+ # Get left and right edges
+ in_width, _ = torch.max(masks, dim=-2)
+ in_width_coords = in_width * torch.arange(w, device=in_width.device)[None, :]
+ right_edges, _ = torch.max(in_width_coords, dim=-1)
+ in_width_coords = in_width_coords + w * (~in_width)
+ left_edges, _ = torch.min(in_width_coords, dim=-1)
+
+ # If the mask is empty the right edge will be to the left of the left edge.
+ # Replace these boxes with [0, 0, 0, 0]
+ empty_filter = (right_edges < left_edges) | (bottom_edges < top_edges)
+ out = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1)
+ out = out * (~empty_filter).unsqueeze(-1)
+
+ # Return to original shape
+ if len(shape) > 2:
+ out = out.reshape(*shape[:-2], 4)
+ else:
+ out = out[0]
+
+ return out
diff --git a/phantom/submodules/sam2/sam2/utils/misc.py b/phantom/submodules/sam2/sam2/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..b65ee825732ff85137805be650edd4cbe8e6f6d4
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/utils/misc.py
@@ -0,0 +1,349 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import warnings
+from threading import Thread
+
+import numpy as np
+import torch
+from PIL import Image
+from tqdm import tqdm
+
+
+def get_sdpa_settings():
+ if torch.cuda.is_available():
+ old_gpu = torch.cuda.get_device_properties(0).major < 7
+ # only use Flash Attention on Ampere (8.0) or newer GPUs
+ use_flash_attn = torch.cuda.get_device_properties(0).major >= 8
+ if not use_flash_attn:
+ warnings.warn(
+ "Flash Attention is disabled as it requires a GPU with Ampere (8.0) CUDA capability.",
+ category=UserWarning,
+ stacklevel=2,
+ )
+ # keep math kernel for PyTorch versions before 2.2 (Flash Attention v2 is only
+ # available on PyTorch 2.2+, while Flash Attention v1 cannot handle all cases)
+ pytorch_version = tuple(int(v) for v in torch.__version__.split(".")[:2])
+ if pytorch_version < (2, 2):
+ warnings.warn(
+ f"You are using PyTorch {torch.__version__} without Flash Attention v2 support. "
+ "Consider upgrading to PyTorch 2.2+ for Flash Attention v2 (which could be faster).",
+ category=UserWarning,
+ stacklevel=2,
+ )
+ math_kernel_on = pytorch_version < (2, 2) or not use_flash_attn
+ else:
+ old_gpu = True
+ use_flash_attn = False
+ math_kernel_on = True
+
+ return old_gpu, use_flash_attn, math_kernel_on
+
+
+def get_connected_components(mask):
+ """
+ Get the connected components (8-connectivity) of binary masks of shape (N, 1, H, W).
+
+ Inputs:
+ - mask: A binary mask tensor of shape (N, 1, H, W), where 1 is foreground and 0 is
+ background.
+
+ Outputs:
+ - labels: A tensor of shape (N, 1, H, W) containing the connected component labels
+ for foreground pixels and 0 for background pixels.
+ - counts: A tensor of shape (N, 1, H, W) containing the area of the connected
+ components for foreground pixels and 0 for background pixels.
+ """
+ from sam2 import _C
+
+ return _C.get_connected_componnets(mask.to(torch.uint8).contiguous())
+
+
+def mask_to_box(masks: torch.Tensor):
+ """
+    Compute bounding boxes given input masks.
+
+    Inputs:
+    - masks: [B, 1, H, W] boolean masks (torch.Tensor)
+
+    Returns:
+    - box_coords: [B, 1, 4] torch.Tensor with the (x, y) coordinates of the top-left and bottom-right box corners
+ """
+ B, _, h, w = masks.shape
+ device = masks.device
+ xs = torch.arange(w, device=device, dtype=torch.int32)
+ ys = torch.arange(h, device=device, dtype=torch.int32)
+ grid_xs, grid_ys = torch.meshgrid(xs, ys, indexing="xy")
+ grid_xs = grid_xs[None, None, ...].expand(B, 1, h, w)
+ grid_ys = grid_ys[None, None, ...].expand(B, 1, h, w)
+ min_xs, _ = torch.min(torch.where(masks, grid_xs, w).flatten(-2), dim=-1)
+ max_xs, _ = torch.max(torch.where(masks, grid_xs, -1).flatten(-2), dim=-1)
+ min_ys, _ = torch.min(torch.where(masks, grid_ys, h).flatten(-2), dim=-1)
+ max_ys, _ = torch.max(torch.where(masks, grid_ys, -1).flatten(-2), dim=-1)
+ bbox_coords = torch.stack((min_xs, min_ys, max_xs, max_ys), dim=-1)
+
+ return bbox_coords
+
+
+def _load_img_as_tensor(img_path, image_size):
+ img_pil = Image.open(img_path)
+ img_np = np.array(img_pil.convert("RGB").resize((image_size, image_size)))
+ if img_np.dtype == np.uint8: # np.uint8 is expected for JPEG images
+ img_np = img_np / 255.0
+ else:
+ raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}")
+ img = torch.from_numpy(img_np).permute(2, 0, 1)
+ video_width, video_height = img_pil.size # the original video size
+ return img, video_height, video_width
+
+
+class AsyncVideoFrameLoader:
+ """
+    A list of video frames to be loaded asynchronously without blocking session start.
+ """
+
+ def __init__(
+ self,
+ img_paths,
+ image_size,
+ offload_video_to_cpu,
+ img_mean,
+ img_std,
+ compute_device,
+ ):
+ self.img_paths = img_paths
+ self.image_size = image_size
+ self.offload_video_to_cpu = offload_video_to_cpu
+ self.img_mean = img_mean
+ self.img_std = img_std
+ # items in `self.images` will be loaded asynchronously
+ self.images = [None] * len(img_paths)
+ # catch and raise any exceptions in the async loading thread
+ self.exception = None
+        # video_height and video_width will be filled when loading the first image
+ self.video_height = None
+ self.video_width = None
+ self.compute_device = compute_device
+
+ # load the first frame to fill video_height and video_width and also
+ # to cache it (since it's most likely where the user will click)
+ self.__getitem__(0)
+
+ # load the rest of frames asynchronously without blocking the session start
+ def _load_frames():
+ try:
+ for n in tqdm(range(len(self.images)), desc="frame loading (JPEG)"):
+ self.__getitem__(n)
+ except Exception as e:
+ self.exception = e
+
+ self.thread = Thread(target=_load_frames, daemon=True)
+ self.thread.start()
+
+ def __getitem__(self, index):
+ if self.exception is not None:
+ raise RuntimeError("Failure in frame loading thread") from self.exception
+
+ img = self.images[index]
+ if img is not None:
+ return img
+
+ img, video_height, video_width = _load_img_as_tensor(
+ self.img_paths[index], self.image_size
+ )
+ self.video_height = video_height
+ self.video_width = video_width
+ # normalize by mean and std
+ img -= self.img_mean
+ img /= self.img_std
+ if not self.offload_video_to_cpu:
+ img = img.to(self.compute_device, non_blocking=True)
+ self.images[index] = img
+ return img
+
+ def __len__(self):
+ return len(self.images)
+
+
+def load_video_frames(
+ video_path,
+ image_size,
+ offload_video_to_cpu,
+ img_mean=(0.485, 0.456, 0.406),
+ img_std=(0.229, 0.224, 0.225),
+ async_loading_frames=False,
+ compute_device=torch.device("cuda"),
+):
+ """
+ Load the video frames from video_path. The frames are resized to image_size as in
+ the model and are loaded to GPU if offload_video_to_cpu=False. This is used by the demo.
+ """
+ is_bytes = isinstance(video_path, bytes)
+ is_str = isinstance(video_path, str)
+ is_mp4_path = is_str and os.path.splitext(video_path)[-1] in [".mp4", ".MP4"]
+ if is_bytes or is_mp4_path:
+ return load_video_frames_from_video_file(
+ video_path=video_path,
+ image_size=image_size,
+ offload_video_to_cpu=offload_video_to_cpu,
+ img_mean=img_mean,
+ img_std=img_std,
+ compute_device=compute_device,
+ )
+ elif is_str and os.path.isdir(video_path):
+ return load_video_frames_from_jpg_images(
+ video_path=video_path,
+ image_size=image_size,
+ offload_video_to_cpu=offload_video_to_cpu,
+ img_mean=img_mean,
+ img_std=img_std,
+ async_loading_frames=async_loading_frames,
+ compute_device=compute_device,
+ )
+ else:
+ raise NotImplementedError(
+ "Only MP4 video and JPEG folder are supported at this moment"
+ )
+
+
+def load_video_frames_from_jpg_images(
+ video_path,
+ image_size,
+ offload_video_to_cpu,
+ img_mean=(0.485, 0.456, 0.406),
+ img_std=(0.229, 0.224, 0.225),
+ async_loading_frames=False,
+ compute_device=torch.device("cuda"),
+):
+ """
+ Load the video frames from a directory of JPEG files (".jpg" format).
+
+ The frames are resized to image_size x image_size and are loaded to GPU if
+ `offload_video_to_cpu` is `False` and to CPU if `offload_video_to_cpu` is `True`.
+
+ You can load a frame asynchronously by setting `async_loading_frames` to `True`.
+ """
+ if isinstance(video_path, str) and os.path.isdir(video_path):
+ jpg_folder = video_path
+ else:
+ raise NotImplementedError(
+ "Only JPEG frames are supported at this moment. For video files, you may use "
+ "ffmpeg (https://ffmpeg.org/) to extract frames into a folder of JPEG files, such as \n"
+ "```\n"
+            "ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'\n"
+ "```\n"
+ "where `-q:v` generates high-quality JPEG frames and `-start_number 0` asks "
+ "ffmpeg to start the JPEG file from 00000.jpg."
+ )
+
+ frame_names = [
+ p
+ for p in os.listdir(jpg_folder)
+ if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
+ ]
+ frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+ num_frames = len(frame_names)
+ if num_frames == 0:
+ raise RuntimeError(f"no images found in {jpg_folder}")
+ img_paths = [os.path.join(jpg_folder, frame_name) for frame_name in frame_names]
+ img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
+ img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
+
+ if async_loading_frames:
+ lazy_images = AsyncVideoFrameLoader(
+ img_paths,
+ image_size,
+ offload_video_to_cpu,
+ img_mean,
+ img_std,
+ compute_device,
+ )
+ return lazy_images, lazy_images.video_height, lazy_images.video_width
+
+ images = torch.zeros(num_frames, 3, image_size, image_size, dtype=torch.float32)
+ for n, img_path in enumerate(tqdm(img_paths, desc="frame loading (JPEG)")):
+ images[n], video_height, video_width = _load_img_as_tensor(img_path, image_size)
+ if not offload_video_to_cpu:
+ images = images.to(compute_device)
+ img_mean = img_mean.to(compute_device)
+ img_std = img_std.to(compute_device)
+ # normalize by mean and std
+ images -= img_mean
+ images /= img_std
+ return images, video_height, video_width
+
+
+def load_video_frames_from_video_file(
+ video_path,
+ image_size,
+ offload_video_to_cpu,
+ img_mean=(0.485, 0.456, 0.406),
+ img_std=(0.229, 0.224, 0.225),
+ compute_device=torch.device("cuda"),
+):
+ """Load the video frames from a video file."""
+ import decord
+
+ img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
+ img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
+ # Get the original video height and width
+ decord.bridge.set_bridge("torch")
+ video_height, video_width, _ = decord.VideoReader(video_path).next().shape
+ # Iterate over all frames in the video
+ images = []
+ for frame in decord.VideoReader(video_path, width=image_size, height=image_size):
+ images.append(frame.permute(2, 0, 1))
+
+ images = torch.stack(images, dim=0).float() / 255.0
+ if not offload_video_to_cpu:
+ images = images.to(compute_device)
+ img_mean = img_mean.to(compute_device)
+ img_std = img_std.to(compute_device)
+ # normalize by mean and std
+ images -= img_mean
+ images /= img_std
+ return images, video_height, video_width
+
+
+def fill_holes_in_mask_scores(mask, max_area):
+ """
+ A post processor to fill small holes in mask scores with area under `max_area`.
+ """
+ # Holes are those connected components in background with area <= self.max_area
+ # (background regions are those with mask scores <= 0)
+ assert max_area > 0, "max_area must be positive"
+
+ input_mask = mask
+ try:
+ labels, areas = get_connected_components(mask <= 0)
+ is_hole = (labels > 0) & (areas <= max_area)
+ # We fill holes with a small positive mask score (0.1) to change them to foreground.
+ mask = torch.where(is_hole, 0.1, mask)
+ except Exception as e:
+ # Skip the post-processing step on removing small holes if the CUDA kernel fails
+ warnings.warn(
+ f"{e}\n\nSkipping the post-processing step due to the error above. You can "
+ "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
+ "functionality may be limited (which doesn't affect the results in most cases; see "
+ "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
+ category=UserWarning,
+ stacklevel=2,
+ )
+ mask = input_mask
+
+ return mask
+
+
+def concat_points(old_point_inputs, new_points, new_labels):
+ """Add new points and labels to previous point inputs (add at the end)."""
+ if old_point_inputs is None:
+ points, labels = new_points, new_labels
+ else:
+ points = torch.cat([old_point_inputs["point_coords"], new_points], dim=1)
+ labels = torch.cat([old_point_inputs["point_labels"], new_labels], dim=1)
+
+ return {"point_coords": points, "point_labels": labels}
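+
+
+# Illustrative sketch of `concat_points` (values made up; shapes are (B, N, 2) / (B, N)):
+#   first = concat_points(None, torch.tensor([[[10.0, 20.0]]]), torch.tensor([[1]]))
+#   both = concat_points(first, torch.tensor([[[30.0, 40.0]]]), torch.tensor([[0]]))
+#   both["point_coords"].shape  # -> torch.Size([1, 2, 2]); labels become shape (1, 2)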
diff --git a/phantom/submodules/sam2/sam2/utils/transforms.py b/phantom/submodules/sam2/sam2/utils/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc17bebfab104b659c5469e8434cf357ae7e24b6
--- /dev/null
+++ b/phantom/submodules/sam2/sam2/utils/transforms.py
@@ -0,0 +1,118 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.transforms import Normalize, Resize, ToTensor
+
+
+class SAM2Transforms(nn.Module):
+ def __init__(
+ self, resolution, mask_threshold, max_hole_area=0.0, max_sprinkle_area=0.0
+ ):
+ """
+ Transforms for SAM2.
+ """
+ super().__init__()
+ self.resolution = resolution
+ self.mask_threshold = mask_threshold
+ self.max_hole_area = max_hole_area
+ self.max_sprinkle_area = max_sprinkle_area
+ self.mean = [0.485, 0.456, 0.406]
+ self.std = [0.229, 0.224, 0.225]
+ self.to_tensor = ToTensor()
+ self.transforms = torch.jit.script(
+ nn.Sequential(
+ Resize((self.resolution, self.resolution)),
+ Normalize(self.mean, self.std),
+ )
+ )
+
+ def __call__(self, x):
+ x = self.to_tensor(x)
+ return self.transforms(x)
+
+ def forward_batch(self, img_list):
+ img_batch = [self.transforms(self.to_tensor(img)) for img in img_list]
+ img_batch = torch.stack(img_batch, dim=0)
+ return img_batch
+
+ def transform_coords(
+ self, coords: torch.Tensor, normalize=False, orig_hw=None
+ ) -> torch.Tensor:
+ """
+        Expects a torch tensor with length 2 in the last dimension. The coordinates can be
+        given either in absolute image coordinates or already normalized to [0, 1]. If they
+        are in absolute image coordinates, set `normalize=True` and provide the original
+        image size via `orig_hw`.
+
+        Returns
+            Coordinates scaled to the model input resolution (i.e. in [0, resolution]),
+            which is what the SAM 2 model expects.
+ """
+ if normalize:
+ assert orig_hw is not None
+ h, w = orig_hw
+ coords = coords.clone()
+ coords[..., 0] = coords[..., 0] / w
+ coords[..., 1] = coords[..., 1] / h
+
+ coords = coords * self.resolution # unnormalize coords
+ return coords
+
+ def transform_boxes(
+ self, boxes: torch.Tensor, normalize=False, orig_hw=None
+ ) -> torch.Tensor:
+ """
+        Expects a tensor of shape Bx4. The coordinates can be given either in absolute image
+        coordinates or normalized to [0, 1]. If they are in absolute image coordinates, set
+        `normalize=True` and provide the original image size via `orig_hw`.
+ """
+ boxes = self.transform_coords(boxes.reshape(-1, 2, 2), normalize, orig_hw)
+ return boxes
+
+ def postprocess_masks(self, masks: torch.Tensor, orig_hw) -> torch.Tensor:
+ """
+ Perform PostProcessing on output masks.
+ """
+ from sam2.utils.misc import get_connected_components
+
+ masks = masks.float()
+ input_masks = masks
+ mask_flat = masks.flatten(0, 1).unsqueeze(1) # flatten as 1-channel image
+ try:
+ if self.max_hole_area > 0:
+ # Holes are those connected components in background with area <= self.fill_hole_area
+ # (background regions are those with mask scores <= self.mask_threshold)
+ labels, areas = get_connected_components(
+ mask_flat <= self.mask_threshold
+ )
+ is_hole = (labels > 0) & (areas <= self.max_hole_area)
+ is_hole = is_hole.reshape_as(masks)
+ # We fill holes with a small positive mask score (10.0) to change them to foreground.
+ masks = torch.where(is_hole, self.mask_threshold + 10.0, masks)
+
+ if self.max_sprinkle_area > 0:
+ labels, areas = get_connected_components(
+ mask_flat > self.mask_threshold
+ )
+ is_hole = (labels > 0) & (areas <= self.max_sprinkle_area)
+ is_hole = is_hole.reshape_as(masks)
+ # We fill holes with negative mask score (-10.0) to change them to background.
+ masks = torch.where(is_hole, self.mask_threshold - 10.0, masks)
+ except Exception as e:
+ # Skip the post-processing step if the CUDA kernel fails
+ warnings.warn(
+ f"{e}\n\nSkipping the post-processing step due to the error above. You can "
+ "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
+ "functionality may be limited (which doesn't affect the results in most cases; see "
+ "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
+ category=UserWarning,
+ stacklevel=2,
+ )
+ masks = input_masks
+
+ masks = F.interpolate(masks, orig_hw, mode="bilinear", align_corners=False)
+ return masks
diff --git a/phantom/submodules/sam2/setup.py b/phantom/submodules/sam2/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..78a634cddb19615c45601681ffbcd1f29af66f47
--- /dev/null
+++ b/phantom/submodules/sam2/setup.py
@@ -0,0 +1,174 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+
+from setuptools import find_packages, setup
+
+# Package metadata
+NAME = "SAM-2"
+VERSION = "1.0"
+DESCRIPTION = "SAM 2: Segment Anything in Images and Videos"
+URL = "https://github.com/facebookresearch/sam2"
+AUTHOR = "Meta AI"
+AUTHOR_EMAIL = "segment-anything@meta.com"
+LICENSE = "Apache 2.0"
+
+# Read the contents of README file
+with open("README.md", "r", encoding="utf-8") as f:
+ LONG_DESCRIPTION = f.read()
+
+# Required dependencies
+REQUIRED_PACKAGES = [
+ "torch>=2.5.1",
+ "torchvision>=0.20.1",
+ "numpy>=1.24.4",
+ "tqdm>=4.66.1",
+ "hydra-core>=1.3.2",
+ "iopath>=0.1.10",
+ "pillow>=9.4.0",
+]
+
+EXTRA_PACKAGES = {
+ "notebooks": [
+ "matplotlib>=3.9.1",
+ "jupyter>=1.0.0",
+ "opencv-python>=4.7.0",
+ "eva-decord>=0.6.1",
+ ],
+ "interactive-demo": [
+ "Flask>=3.0.3",
+ "Flask-Cors>=5.0.0",
+ "av>=13.0.0",
+ "dataclasses-json>=0.6.7",
+ "eva-decord>=0.6.1",
+ "gunicorn>=23.0.0",
+ "imagesize>=1.4.1",
+ "pycocotools>=2.0.8",
+ "strawberry-graphql>=0.243.0",
+ ],
+ "dev": [
+ "black==24.2.0",
+ "usort==1.0.2",
+ "ufmt==2.0.0b2",
+ "fvcore>=0.1.5.post20221221",
+ "pandas>=2.2.2",
+ "scikit-image>=0.24.0",
+ "tensorboard>=2.17.0",
+ "pycocotools>=2.0.8",
+ "tensordict>=0.6.0",
+ "opencv-python>=4.7.0",
+ "submitit>=1.5.1",
+ ],
+}
+
+# By default, we also build the SAM 2 CUDA extension.
+# You may turn off CUDA build with `export SAM2_BUILD_CUDA=0`.
+BUILD_CUDA = os.getenv("SAM2_BUILD_CUDA", "1") == "1"
+# By default, we allow SAM 2 installation to proceed even with build errors.
+# You may force stopping on errors with `export SAM2_BUILD_ALLOW_ERRORS=0`.
+BUILD_ALLOW_ERRORS = os.getenv("SAM2_BUILD_ALLOW_ERRORS", "1") == "1"
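+
+# For example, the following shell invocations toggle these switches (illustrative only):
+#   SAM2_BUILD_CUDA=0 pip install -e .             # skip building the CUDA extension
+#   SAM2_BUILD_ALLOW_ERRORS=0 pip install -v -e .  # stop on build errors instead of skipping them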
+
+# Catch and skip errors during extension building and print a warning message
+# (note that this message only shows up under verbose build mode
+# "pip install -v -e ." or "python setup.py build_ext -v")
+CUDA_ERROR_MSG = (
+ "{}\n\n"
+ "Failed to build the SAM 2 CUDA extension due to the error above. "
+ "You can still use SAM 2 and it's OK to ignore the error above, although some "
+ "post-processing functionality may be limited (which doesn't affect the results in most cases; "
+    "see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).\n"
+)
+
+
+def get_extensions():
+ if not BUILD_CUDA:
+ return []
+
+ try:
+ from torch.utils.cpp_extension import CUDAExtension
+
+ srcs = ["sam2/csrc/connected_components.cu"]
+ compile_args = {
+ "cxx": [],
+ "nvcc": [
+ "-DCUDA_HAS_FP16=1",
+ "-D__CUDA_NO_HALF_OPERATORS__",
+ "-D__CUDA_NO_HALF_CONVERSIONS__",
+ "-D__CUDA_NO_HALF2_OPERATORS__",
+ ],
+ }
+ ext_modules = [CUDAExtension("sam2._C", srcs, extra_compile_args=compile_args)]
+ except Exception as e:
+ if BUILD_ALLOW_ERRORS:
+ print(CUDA_ERROR_MSG.format(e))
+ ext_modules = []
+ else:
+ raise e
+
+ return ext_modules
+
+
+try:
+ from torch.utils.cpp_extension import BuildExtension
+
+ class BuildExtensionIgnoreErrors(BuildExtension):
+
+ def finalize_options(self):
+ try:
+ super().finalize_options()
+ except Exception as e:
+ print(CUDA_ERROR_MSG.format(e))
+ self.extensions = []
+
+ def build_extensions(self):
+ try:
+ super().build_extensions()
+ except Exception as e:
+ print(CUDA_ERROR_MSG.format(e))
+ self.extensions = []
+
+ def get_ext_filename(self, ext_name):
+ try:
+ return super().get_ext_filename(ext_name)
+ except Exception as e:
+ print(CUDA_ERROR_MSG.format(e))
+ self.extensions = []
+ return "_C.so"
+
+ cmdclass = {
+ "build_ext": (
+ BuildExtensionIgnoreErrors.with_options(no_python_abi_suffix=True)
+ if BUILD_ALLOW_ERRORS
+ else BuildExtension.with_options(no_python_abi_suffix=True)
+ )
+ }
+except Exception as e:
+ cmdclass = {}
+ if BUILD_ALLOW_ERRORS:
+ print(CUDA_ERROR_MSG.format(e))
+ else:
+ raise e
+
+
+# Setup configuration
+setup(
+ name=NAME,
+ version=VERSION,
+ description=DESCRIPTION,
+ long_description=LONG_DESCRIPTION,
+ long_description_content_type="text/markdown",
+ url=URL,
+ author=AUTHOR,
+ author_email=AUTHOR_EMAIL,
+ license=LICENSE,
+ packages=find_packages(exclude="notebooks"),
+ include_package_data=True,
+ install_requires=REQUIRED_PACKAGES,
+ extras_require=EXTRA_PACKAGES,
+ python_requires=">=3.10.0",
+ ext_modules=get_extensions(),
+ cmdclass=cmdclass,
+)
diff --git a/phantom/submodules/sam2/tools/README.md b/phantom/submodules/sam2/tools/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1dd0e8a754f4bf27ee321084076f3ebdb2285450
--- /dev/null
+++ b/phantom/submodules/sam2/tools/README.md
@@ -0,0 +1,36 @@
+## SAM 2 toolkits
+
+This directory provides toolkits for additional SAM 2 use cases.
+
+### Semi-supervised VOS inference
+
+The `vos_inference.py` script can be used to generate predictions for semi-supervised video object segmentation (VOS) evaluation on datasets such as [DAVIS](https://davischallenge.org/index.html), [MOSE](https://henghuiding.github.io/MOSE/) or the SA-V dataset.
+
+After installing SAM 2 and its dependencies, the script can be used as follows (using the [DAVIS 2017 dataset](https://davischallenge.org/davis2017/code.html) as an example). It saves the prediction PNG files to the directory given by `--output_mask_dir`.
+```bash
+python ./tools/vos_inference.py \
+ --sam2_cfg configs/sam2.1/sam2.1_hiera_b+.yaml \
+ --sam2_checkpoint ./checkpoints/sam2.1_hiera_base_plus.pt \
+ --base_video_dir /path-to-davis-2017/JPEGImages/480p \
+ --input_mask_dir /path-to-davis-2017/Annotations/480p \
+ --video_list_file /path-to-davis-2017/ImageSets/2017/val.txt \
+ --output_mask_dir ./outputs/davis_2017_pred_pngs
+```
+(replace `/path-to-davis-2017` with the path to DAVIS 2017 dataset)
+
+To evaluate on the SA-V dataset, where the object masks are stored as per-object PNG files, we need to **add the `--per_obj_png_file` flag** as follows (using SA-V val as an example). With this flag, the script also saves the output masks as per-object PNG files.
+```bash
+python ./tools/vos_inference.py \
+ --sam2_cfg configs/sam2.1/sam2.1_hiera_b+.yaml \
+ --sam2_checkpoint ./checkpoints/sam2.1_hiera_base_plus.pt \
+ --base_video_dir /path-to-sav-val/JPEGImages_24fps \
+ --input_mask_dir /path-to-sav-val/Annotations_6fps \
+ --video_list_file /path-to-sav-val/sav_val.txt \
+ --per_obj_png_file \
+ --output_mask_dir ./outputs/sav_val_pred_pngs
+```
+(replace `/path-to-sav-val` with the path to SA-V val)
+
+Then, we can use the evaluation tools or servers for each dataset to get the performance of the prediction PNG files above.
+
+Note: by default, the `vos_inference.py` script above assumes that all objects to track already appear on frame 0 in each video (as is the case in DAVIS, MOSE or SA-V). **For VOS datasets that don't have all objects to track appearing in the first frame (such as LVOS or YouTube-VOS), please add the `--track_object_appearing_later_in_video` flag when using `vos_inference.py`**.
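+
+For example, for a dataset like LVOS or YouTube-VOS the invocation would look roughly like the following, simply appending that flag (the dataset paths are placeholders, not files shipped with this repo):
+```bash
+python ./tools/vos_inference.py \
+  --sam2_cfg configs/sam2.1/sam2.1_hiera_b+.yaml \
+  --sam2_checkpoint ./checkpoints/sam2.1_hiera_base_plus.pt \
+  --base_video_dir /path-to-your-dataset/JPEGImages \
+  --input_mask_dir /path-to-your-dataset/Annotations \
+  --video_list_file /path-to-your-dataset/val.txt \
+  --output_mask_dir ./outputs/pred_pngs \
+  --track_object_appearing_later_in_video
+```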
diff --git a/phantom/submodules/sam2/tools/vos_inference.py b/phantom/submodules/sam2/tools/vos_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef3e8c6740541f342cfbbe0fa8ad80e47caf4ac9
--- /dev/null
+++ b/phantom/submodules/sam2/tools/vos_inference.py
@@ -0,0 +1,507 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+from collections import defaultdict
+
+import numpy as np
+import torch
+from PIL import Image
+from sam2.build_sam import build_sam2_video_predictor
+
+
+# the PNG palette for DAVIS 2017 dataset
+DAVIS_PALETTE = b"\x00\x00\x00\x80\x00\x00\x00\x80\x00\x80\x80\x00\x00\x00\x80\x80\x00\x80\x00\x80\x80\x80\x80\x80@\x00\x00\xc0\x00\x00@\x80\x00\xc0\x80\x00@\x00\x80\xc0\x00\x80@\x80\x80\xc0\x80\x80\x00@\x00\x80@\x00\x00\xc0\x00\x80\xc0\x00\x00@\x80\x80@\x80\x00\xc0\x80\x80\xc0\x80@@\x00\xc0@\x00@\xc0\x00\xc0\xc0\x00@@\x80\xc0@\x80@\xc0\x80\xc0\xc0\x80\x00\x00@\x80\x00@\x00\x80@\x80\x80@\x00\x00\xc0\x80\x00\xc0\x00\x80\xc0\x80\x80\xc0@\x00@\xc0\x00@@\x80@\xc0\x80@@\x00\xc0\xc0\x00\xc0@\x80\xc0\xc0\x80\xc0\x00@@\x80@@\x00\xc0@\x80\xc0@\x00@\xc0\x80@\xc0\x00\xc0\xc0\x80\xc0\xc0@@@\xc0@@@\xc0@\xc0\xc0@@@\xc0\xc0@\xc0@\xc0\xc0\xc0\xc0\xc0 \x00\x00\xa0\x00\x00 \x80\x00\xa0\x80\x00 \x00\x80\xa0\x00\x80 \x80\x80\xa0\x80\x80`\x00\x00\xe0\x00\x00`\x80\x00\xe0\x80\x00`\x00\x80\xe0\x00\x80`\x80\x80\xe0\x80\x80 @\x00\xa0@\x00 \xc0\x00\xa0\xc0\x00 @\x80\xa0@\x80 \xc0\x80\xa0\xc0\x80`@\x00\xe0@\x00`\xc0\x00\xe0\xc0\x00`@\x80\xe0@\x80`\xc0\x80\xe0\xc0\x80 \x00@\xa0\x00@ \x80@\xa0\x80@ \x00\xc0\xa0\x00\xc0 \x80\xc0\xa0\x80\xc0`\x00@\xe0\x00@`\x80@\xe0\x80@`\x00\xc0\xe0\x00\xc0`\x80\xc0\xe0\x80\xc0 @@\xa0@@ \xc0@\xa0\xc0@ @\xc0\xa0@\xc0 \xc0\xc0\xa0\xc0\xc0`@@\xe0@@`\xc0@\xe0\xc0@`@\xc0\xe0@\xc0`\xc0\xc0\xe0\xc0\xc0\x00 \x00\x80 \x00\x00\xa0\x00\x80\xa0\x00\x00 \x80\x80 \x80\x00\xa0\x80\x80\xa0\x80@ \x00\xc0 \x00@\xa0\x00\xc0\xa0\x00@ \x80\xc0 \x80@\xa0\x80\xc0\xa0\x80\x00`\x00\x80`\x00\x00\xe0\x00\x80\xe0\x00\x00`\x80\x80`\x80\x00\xe0\x80\x80\xe0\x80@`\x00\xc0`\x00@\xe0\x00\xc0\xe0\x00@`\x80\xc0`\x80@\xe0\x80\xc0\xe0\x80\x00 @\x80 @\x00\xa0@\x80\xa0@\x00 \xc0\x80 \xc0\x00\xa0\xc0\x80\xa0\xc0@ @\xc0 @@\xa0@\xc0\xa0@@ \xc0\xc0 \xc0@\xa0\xc0\xc0\xa0\xc0\x00`@\x80`@\x00\xe0@\x80\xe0@\x00`\xc0\x80`\xc0\x00\xe0\xc0\x80\xe0\xc0@`@\xc0`@@\xe0@\xc0\xe0@@`\xc0\xc0`\xc0@\xe0\xc0\xc0\xe0\xc0 \x00\xa0 \x00 \xa0\x00\xa0\xa0\x00 \x80\xa0 \x80 \xa0\x80\xa0\xa0\x80` \x00\xe0 \x00`\xa0\x00\xe0\xa0\x00` \x80\xe0 \x80`\xa0\x80\xe0\xa0\x80 `\x00\xa0`\x00 \xe0\x00\xa0\xe0\x00 `\x80\xa0`\x80 \xe0\x80\xa0\xe0\x80``\x00\xe0`\x00`\xe0\x00\xe0\xe0\x00``\x80\xe0`\x80`\xe0\x80\xe0\xe0\x80 @\xa0 @ \xa0@\xa0\xa0@ \xc0\xa0 \xc0 \xa0\xc0\xa0\xa0\xc0` @\xe0 @`\xa0@\xe0\xa0@` \xc0\xe0 \xc0`\xa0\xc0\xe0\xa0\xc0 `@\xa0`@ \xe0@\xa0\xe0@ `\xc0\xa0`\xc0 \xe0\xc0\xa0\xe0\xc0``@\xe0`@`\xe0@\xe0\xe0@``\xc0\xe0`\xc0`\xe0\xc0\xe0\xe0\xc0"
+
+
+def load_ann_png(path):
+ """Load a PNG file as a mask and its palette."""
+ mask = Image.open(path)
+ palette = mask.getpalette()
+ mask = np.array(mask).astype(np.uint8)
+ return mask, palette
+
+
+def save_ann_png(path, mask, palette):
+ """Save a mask as a PNG file with the given palette."""
+ assert mask.dtype == np.uint8
+ assert mask.ndim == 2
+ output_mask = Image.fromarray(mask)
+ output_mask.putpalette(palette)
+ output_mask.save(path)
+
+
+def get_per_obj_mask(mask):
+ """Split a mask into per-object masks."""
+ object_ids = np.unique(mask)
+ object_ids = object_ids[object_ids > 0].tolist()
+ per_obj_mask = {object_id: (mask == object_id) for object_id in object_ids}
+ return per_obj_mask
+
+
+def put_per_obj_mask(per_obj_mask, height, width):
+ """Combine per-object masks into a single mask."""
+ mask = np.zeros((height, width), dtype=np.uint8)
+ object_ids = sorted(per_obj_mask)[::-1]
+ for object_id in object_ids:
+ object_mask = per_obj_mask[object_id]
+ object_mask = object_mask.reshape(height, width)
+ mask[object_mask] = object_id
+ return mask
+
+
+def load_masks_from_dir(
+ input_mask_dir, video_name, frame_name, per_obj_png_file, allow_missing=False
+):
+ """Load masks from a directory as a dict of per-object masks."""
+ if not per_obj_png_file:
+ input_mask_path = os.path.join(input_mask_dir, video_name, f"{frame_name}.png")
+ if allow_missing and not os.path.exists(input_mask_path):
+ return {}, None
+ input_mask, input_palette = load_ann_png(input_mask_path)
+ per_obj_input_mask = get_per_obj_mask(input_mask)
+ else:
+ per_obj_input_mask = {}
+ input_palette = None
+ # each object is a directory in "{object_id:%03d}" format
+ for object_name in os.listdir(os.path.join(input_mask_dir, video_name)):
+ object_id = int(object_name)
+ input_mask_path = os.path.join(
+ input_mask_dir, video_name, object_name, f"{frame_name}.png"
+ )
+ if allow_missing and not os.path.exists(input_mask_path):
+ continue
+ input_mask, input_palette = load_ann_png(input_mask_path)
+ per_obj_input_mask[object_id] = input_mask > 0
+
+ return per_obj_input_mask, input_palette
+
+
+def save_masks_to_dir(
+ output_mask_dir,
+ video_name,
+ frame_name,
+ per_obj_output_mask,
+ height,
+ width,
+ per_obj_png_file,
+ output_palette,
+):
+ """Save masks to a directory as PNG files."""
+ os.makedirs(os.path.join(output_mask_dir, video_name), exist_ok=True)
+ if not per_obj_png_file:
+ output_mask = put_per_obj_mask(per_obj_output_mask, height, width)
+ output_mask_path = os.path.join(
+ output_mask_dir, video_name, f"{frame_name}.png"
+ )
+ save_ann_png(output_mask_path, output_mask, output_palette)
+ else:
+ for object_id, object_mask in per_obj_output_mask.items():
+ object_name = f"{object_id:03d}"
+ os.makedirs(
+ os.path.join(output_mask_dir, video_name, object_name),
+ exist_ok=True,
+ )
+ output_mask = object_mask.reshape(height, width).astype(np.uint8)
+ output_mask_path = os.path.join(
+ output_mask_dir, video_name, object_name, f"{frame_name}.png"
+ )
+ save_ann_png(output_mask_path, output_mask, output_palette)
+
+
+@torch.inference_mode()
+@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+def vos_inference(
+ predictor,
+ base_video_dir,
+ input_mask_dir,
+ output_mask_dir,
+ video_name,
+ score_thresh=0.0,
+ use_all_masks=False,
+ per_obj_png_file=False,
+):
+ """Run VOS inference on a single video with the given predictor."""
+ # load the video frames and initialize the inference state on this video
+ video_dir = os.path.join(base_video_dir, video_name)
+ frame_names = [
+ os.path.splitext(p)[0]
+ for p in os.listdir(video_dir)
+ if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
+ ]
+ frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+ inference_state = predictor.init_state(
+ video_path=video_dir, async_loading_frames=False
+ )
+ height = inference_state["video_height"]
+ width = inference_state["video_width"]
+ input_palette = None
+
+ # fetch mask inputs from input_mask_dir (either only mask for the first frame, or all available masks)
+ if not use_all_masks:
+        # use only the first frame's ground-truth mask as the input mask
+ input_frame_inds = [0]
+ else:
+ # use all mask files available in the input_mask_dir as the input masks
+ if not per_obj_png_file:
+ input_frame_inds = [
+ idx
+ for idx, name in enumerate(frame_names)
+ if os.path.exists(
+ os.path.join(input_mask_dir, video_name, f"{name}.png")
+ )
+ ]
+ else:
+ input_frame_inds = [
+ idx
+ for object_name in os.listdir(os.path.join(input_mask_dir, video_name))
+ for idx, name in enumerate(frame_names)
+ if os.path.exists(
+ os.path.join(input_mask_dir, video_name, object_name, f"{name}.png")
+ )
+ ]
+ # check and make sure we got at least one input frame
+ if len(input_frame_inds) == 0:
+ raise RuntimeError(
+ f"In {video_name=}, got no input masks in {input_mask_dir=}. "
+ "Please make sure the input masks are available in the correct format."
+ )
+ input_frame_inds = sorted(set(input_frame_inds))
+
+ # add those input masks to SAM 2 inference state before propagation
+ object_ids_set = None
+ for input_frame_idx in input_frame_inds:
+ try:
+ per_obj_input_mask, input_palette = load_masks_from_dir(
+ input_mask_dir=input_mask_dir,
+ video_name=video_name,
+ frame_name=frame_names[input_frame_idx],
+ per_obj_png_file=per_obj_png_file,
+ )
+ except FileNotFoundError as e:
+ raise RuntimeError(
+ f"In {video_name=}, failed to load input mask for frame {input_frame_idx=}. "
+ "Please add the `--track_object_appearing_later_in_video` flag "
+ "for VOS datasets that don't have all objects to track appearing "
+ "in the first frame (such as LVOS or YouTube-VOS)."
+ ) from e
+ # get the list of object ids to track from the first input frame
+ if object_ids_set is None:
+ object_ids_set = set(per_obj_input_mask)
+ for object_id, object_mask in per_obj_input_mask.items():
+ # check and make sure no new object ids appear only in later frames
+ if object_id not in object_ids_set:
+ raise RuntimeError(
+ f"In {video_name=}, got a new {object_id=} appearing only in a "
+ f"later {input_frame_idx=} (but not appearing in the first frame). "
+ "Please add the `--track_object_appearing_later_in_video` flag "
+ "for VOS datasets that don't have all objects to track appearing "
+ "in the first frame (such as LVOS or YouTube-VOS)."
+ )
+ predictor.add_new_mask(
+ inference_state=inference_state,
+ frame_idx=input_frame_idx,
+ obj_id=object_id,
+ mask=object_mask,
+ )
+
+ # check and make sure we have at least one object to track
+ if object_ids_set is None or len(object_ids_set) == 0:
+ raise RuntimeError(
+ f"In {video_name=}, got no object ids on {input_frame_inds=}. "
+ "Please add the `--track_object_appearing_later_in_video` flag "
+ "for VOS datasets that don't have all objects to track appearing "
+ "in the first frame (such as LVOS or YouTube-VOS)."
+ )
+ # run propagation throughout the video and collect the results in a dict
+ os.makedirs(os.path.join(output_mask_dir, video_name), exist_ok=True)
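+    # fall back to the standard DAVIS palette when the input masks did not provide one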
+ output_palette = input_palette or DAVIS_PALETTE
+ video_segments = {} # video_segments contains the per-frame segmentation results
+ for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
+ inference_state
+ ):
+ per_obj_output_mask = {
+ out_obj_id: (out_mask_logits[i] > score_thresh).cpu().numpy()
+ for i, out_obj_id in enumerate(out_obj_ids)
+ }
+ video_segments[out_frame_idx] = per_obj_output_mask
+
+ # write the output masks as palette PNG files to output_mask_dir
+ for out_frame_idx, per_obj_output_mask in video_segments.items():
+ save_masks_to_dir(
+ output_mask_dir=output_mask_dir,
+ video_name=video_name,
+ frame_name=frame_names[out_frame_idx],
+ per_obj_output_mask=per_obj_output_mask,
+ height=height,
+ width=width,
+ per_obj_png_file=per_obj_png_file,
+ output_palette=output_palette,
+ )
+
+
+@torch.inference_mode()
+@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+def vos_separate_inference_per_object(
+ predictor,
+ base_video_dir,
+ input_mask_dir,
+ output_mask_dir,
+ video_name,
+ score_thresh=0.0,
+ use_all_masks=False,
+ per_obj_png_file=False,
+):
+ """
+ Run VOS inference on a single video with the given predictor.
+
+    Unlike `vos_inference`, this function runs inference separately for each object
+ in a video, which could be applied to datasets like LVOS or YouTube-VOS that
+ don't have all objects to track appearing in the first frame (i.e. some objects
+ might appear only later in the video).
+ """
+ # load the video frames and initialize the inference state on this video
+ video_dir = os.path.join(base_video_dir, video_name)
+ frame_names = [
+ os.path.splitext(p)[0]
+ for p in os.listdir(video_dir)
+ if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
+ ]
+ frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+ inference_state = predictor.init_state(
+ video_path=video_dir, async_loading_frames=False
+ )
+ height = inference_state["video_height"]
+ width = inference_state["video_width"]
+ input_palette = None
+
+ # collect all the object ids and their input masks
+ inputs_per_object = defaultdict(dict)
+ for idx, name in enumerate(frame_names):
+ if per_obj_png_file or os.path.exists(
+ os.path.join(input_mask_dir, video_name, f"{name}.png")
+ ):
+ per_obj_input_mask, input_palette = load_masks_from_dir(
+ input_mask_dir=input_mask_dir,
+ video_name=video_name,
+ frame_name=frame_names[idx],
+ per_obj_png_file=per_obj_png_file,
+ allow_missing=True,
+ )
+ for object_id, object_mask in per_obj_input_mask.items():
+ # skip empty masks
+ if not np.any(object_mask):
+ continue
+ # if `use_all_masks=False`, we only use the first mask for each object
+ if len(inputs_per_object[object_id]) > 0 and not use_all_masks:
+ continue
+ print(f"adding mask from frame {idx} as input for {object_id=}")
+ inputs_per_object[object_id][idx] = object_mask
+
+ # run inference separately for each object in the video
+ object_ids = sorted(inputs_per_object)
+ output_scores_per_object = defaultdict(dict)
+ for object_id in object_ids:
+ # add those input masks to SAM 2 inference state before propagation
+ input_frame_inds = sorted(inputs_per_object[object_id])
+ predictor.reset_state(inference_state)
+ for input_frame_idx in input_frame_inds:
+ predictor.add_new_mask(
+ inference_state=inference_state,
+ frame_idx=input_frame_idx,
+ obj_id=object_id,
+ mask=inputs_per_object[object_id][input_frame_idx],
+ )
+
+ # run propagation throughout the video and collect the results in a dict
+ for out_frame_idx, _, out_mask_logits in predictor.propagate_in_video(
+ inference_state,
+ start_frame_idx=min(input_frame_inds),
+ reverse=False,
+ ):
+ obj_scores = out_mask_logits.cpu().numpy()
+ output_scores_per_object[object_id][out_frame_idx] = obj_scores
+
+ # post-processing: consolidate the per-object scores into per-frame masks
+ os.makedirs(os.path.join(output_mask_dir, video_name), exist_ok=True)
+ output_palette = input_palette or DAVIS_PALETTE
+ video_segments = {} # video_segments contains the per-frame segmentation results
+ for frame_idx in range(len(frame_names)):
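+        # fill with a very low logit so that objects without a prediction on this frame fall below any score threshold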
+ scores = torch.full(
+ size=(len(object_ids), 1, height, width),
+ fill_value=-1024.0,
+ dtype=torch.float32,
+ )
+ for i, object_id in enumerate(object_ids):
+ if frame_idx in output_scores_per_object[object_id]:
+ scores[i] = torch.from_numpy(
+ output_scores_per_object[object_id][frame_idx]
+ )
+
+ if not per_obj_png_file:
+ scores = predictor._apply_non_overlapping_constraints(scores)
+ per_obj_output_mask = {
+ object_id: (scores[i] > score_thresh).cpu().numpy()
+ for i, object_id in enumerate(object_ids)
+ }
+ video_segments[frame_idx] = per_obj_output_mask
+
+ # write the output masks as palette PNG files to output_mask_dir
+ for frame_idx, per_obj_output_mask in video_segments.items():
+ save_masks_to_dir(
+ output_mask_dir=output_mask_dir,
+ video_name=video_name,
+ frame_name=frame_names[frame_idx],
+ per_obj_output_mask=per_obj_output_mask,
+ height=height,
+ width=width,
+ per_obj_png_file=per_obj_png_file,
+ output_palette=output_palette,
+ )
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--sam2_cfg",
+ type=str,
+ default="configs/sam2.1/sam2.1_hiera_b+.yaml",
+ help="SAM 2 model configuration file",
+ )
+ parser.add_argument(
+ "--sam2_checkpoint",
+ type=str,
+ default="./checkpoints/sam2.1_hiera_base_plus.pt",
+ help="path to the SAM 2 model checkpoint",
+ )
+ parser.add_argument(
+ "--base_video_dir",
+ type=str,
+ required=True,
+ help="directory containing videos (as JPEG files) to run VOS prediction on",
+ )
+ parser.add_argument(
+ "--input_mask_dir",
+ type=str,
+ required=True,
+ help="directory containing input masks (as PNG files) of each video",
+ )
+ parser.add_argument(
+ "--video_list_file",
+ type=str,
+ default=None,
+ help="text file containing the list of video names to run VOS prediction on",
+ )
+ parser.add_argument(
+ "--output_mask_dir",
+ type=str,
+ required=True,
+ help="directory to save the output masks (as PNG files)",
+ )
+ parser.add_argument(
+ "--score_thresh",
+ type=float,
+ default=0.0,
+ help="threshold for the output mask logits (default: 0.0)",
+ )
+ parser.add_argument(
+ "--use_all_masks",
+ action="store_true",
+ help="whether to use all available PNG files in input_mask_dir "
+ "(default without this flag: just the first PNG file as input to the SAM 2 model; "
+ "usually we don't need this flag, since semi-supervised VOS evaluation usually takes input from the first frame only)",
+ )
+ parser.add_argument(
+ "--per_obj_png_file",
+ action="store_true",
+        help="whether to use separate per-object PNG files for input and output masks "
+ "(default without this flag: all object masks are packed into a single PNG file on each frame following DAVIS format; "
+ "note that the SA-V dataset stores each object mask as an individual PNG file and requires this flag)",
+ )
+ parser.add_argument(
+ "--apply_postprocessing",
+ action="store_true",
+ help="whether to apply postprocessing (e.g. hole-filling) to the output masks "
+ "(we don't apply such post-processing in the SAM 2 model evaluation)",
+ )
+ parser.add_argument(
+ "--track_object_appearing_later_in_video",
+ action="store_true",
+ help="whether to track objects that appear later in the video (i.e. not on the first frame; "
+ "some VOS datasets like LVOS or YouTube-VOS don't have all objects appearing in the first frame)",
+ )
+ parser.add_argument(
+ "--use_vos_optimized_video_predictor",
+ action="store_true",
+ help="whether to use vos optimized video predictor with all modules compiled",
+ )
+ args = parser.parse_args()
+
+    # per-object PNG files may contain overlapping object masks, so don't enforce the non-overlap constraint in that case
+ hydra_overrides_extra = [
+ "++model.non_overlap_masks=" + ("false" if args.per_obj_png_file else "true")
+ ]
+ predictor = build_sam2_video_predictor(
+ config_file=args.sam2_cfg,
+ ckpt_path=args.sam2_checkpoint,
+ apply_postprocessing=args.apply_postprocessing,
+ hydra_overrides_extra=hydra_overrides_extra,
+ vos_optimized=args.use_vos_optimized_video_predictor,
+ )
+
+ if args.use_all_masks:
+ print("using all available masks in input_mask_dir as input to the SAM 2 model")
+ else:
+ print(
+ "using only the first frame's mask in input_mask_dir as input to the SAM 2 model"
+ )
+ # if a video list file is provided, read the video names from the file
+ # (otherwise, we use all subdirectories in base_video_dir)
+ if args.video_list_file is not None:
+ with open(args.video_list_file, "r") as f:
+ video_names = [v.strip() for v in f.readlines()]
+ else:
+ video_names = [
+ p
+ for p in os.listdir(args.base_video_dir)
+ if os.path.isdir(os.path.join(args.base_video_dir, p))
+ ]
+ print(f"running VOS prediction on {len(video_names)} videos:\n{video_names}")
+
+ for n_video, video_name in enumerate(video_names):
+ print(f"\n{n_video + 1}/{len(video_names)} - running on {video_name}")
+ if not args.track_object_appearing_later_in_video:
+ vos_inference(
+ predictor=predictor,
+ base_video_dir=args.base_video_dir,
+ input_mask_dir=args.input_mask_dir,
+ output_mask_dir=args.output_mask_dir,
+ video_name=video_name,
+ score_thresh=args.score_thresh,
+ use_all_masks=args.use_all_masks,
+ per_obj_png_file=args.per_obj_png_file,
+ )
+ else:
+ vos_separate_inference_per_object(
+ predictor=predictor,
+ base_video_dir=args.base_video_dir,
+ input_mask_dir=args.input_mask_dir,
+ output_mask_dir=args.output_mask_dir,
+ video_name=video_name,
+ score_thresh=args.score_thresh,
+ use_all_masks=args.use_all_masks,
+ per_obj_png_file=args.per_obj_png_file,
+ )
+
+ print(
+ f"completed VOS prediction on {len(video_names)} videos -- "
+ f"output masks saved to {args.output_mask_dir}"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/phantom/submodules/sam2/training/README.md b/phantom/submodules/sam2/training/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b0c829d49d051d8f72e7bef959e33e6f0329c94d
--- /dev/null
+++ b/phantom/submodules/sam2/training/README.md
@@ -0,0 +1,116 @@
+# Training Code for SAM 2
+
+This folder contains the training code for SAM 2, a foundation model for promptable visual segmentation in images and videos.
+The code allows users to train and fine-tune SAM 2 on their own datasets (image, video, or both).
+
+## Structure
+
+The training code is organized into the following subfolders:
+
+* `dataset`: This folder contains the image and video dataset and dataloader classes, as well as their transforms.
+* `model`: This folder contains the main model class (`SAM2Train`) for training/fine-tuning. `SAM2Train` inherits from the `SAM2Base` model and provides functions to enable training or fine-tuning SAM 2. It also accepts all training-time parameters used for simulating user prompts (e.g. iterative point sampling).
+* `utils`: This folder contains training utils such as loggers and distributed training utils.
+* `scripts`: This folder contains the script to extract the frames of SA-V dataset to be used in training.
+* `loss_fns.py`: This file has the main loss class (`MultiStepMultiMasksAndIous`) used for training.
+* `optimizer.py`: This file contains all optimizer utils that support arbitrary schedulers.
+* `trainer.py`: This file contains the `Trainer` class that accepts all the `Hydra` configurable modules (model, optimizer, datasets, etc.) and implements the main train/eval loop.
+* `train.py`: This script is used to launch training jobs. It supports single and multi-node jobs. For usage, please check the [Getting Started](README.md#getting-started) section or run `python training/train.py -h`
+
+## Getting Started
+
+To get started with the training code, we provide a simple example to fine-tune our checkpoints on the [MOSE](https://henghuiding.github.io/MOSE/) dataset, which can be extended to your custom datasets.
+
+#### Requirements:
+- We assume training on A100 GPUs with **80 GB** of memory.
+- Download the MOSE dataset using one of the provided links from [here](https://github.com/henghuiding/MOSE-api?tab=readme-ov-file#download).
+
+#### Steps to fine-tune on MOSE:
+- Install the packages required for training by running `pip install -e ".[dev]"`.
+- Set the paths for MOSE dataset in `configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml`.
+ ```yaml
+ dataset:
+ # PATHS to Dataset
+ img_folder: null # PATH to MOSE JPEGImages folder
+ gt_folder: null # PATH to MOSE Annotations folder
+ file_list_txt: null # Optional PATH to filelist containing a subset of videos to be used for training
+ ```
+- To fine-tune the base model on MOSE using 8 GPUs, run
+
+  ```bash
+ python training/train.py \
+ -c configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml \
+ --use-cluster 0 \
+ --num-gpus 8
+ ```
+
+  We also support multi-node training on a cluster using [SLURM](https://slurm.schedmd.com/documentation.html). For example, you can train on 2 nodes by running
+
+  ```bash
+ python training/train.py \
+ -c configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml \
+ --use-cluster 1 \
+ --num-gpus 8 \
+    --num-nodes 2 \
+ --partition $PARTITION \
+ --qos $QOS \
+ --account $ACCOUNT
+ ```
+ where partition, qos, and account are optional and depend on your SLURM configuration.
+  By default, the checkpoint and logs will be saved under the `sam2_logs` directory in the root of the repo. Alternatively, you can set the experiment log directory in the config file as follows:
+
+ ```yaml
+ experiment_log_dir: null # Path to log directory, defaults to ./sam2_logs/${config_name}
+ ```
+  The training losses can be monitored using the `tensorboard` logs stored under `tensorboard/` in the experiment log directory. We also provide a sample validation [split](../training/assets/MOSE_sample_val_list.txt) for evaluation purposes. To generate predictions, follow this [guide](../tools/README.md) on how to use our `vos_inference.py` script (an example command is sketched at the end of this section). After generating the predictions, you can run `sav_evaluator.py` as detailed [here](../sav_dataset/README.md#sa-v-val-and-test-evaluation). The expected MOSE J&F after fine-tuning the Base plus model is 79.4.
+
+
+ After training/fine-tuning, you can then use the new checkpoint (saved in `checkpoints/` in the experiment log directory) similar to SAM 2 released checkpoints (as illustrated [here](../README.md#image-prediction)).
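+
+  As a quick check after fine-tuning, you can generate predictions on the sample MOSE validation split with the `vos_inference.py` script. The command below is only a sketch based on the flags defined in `tools/vos_inference.py`; the checkpoint and MOSE paths are placeholders to adapt to your own setup.
+
+  ```bash
+  python tools/vos_inference.py \
+    --sam2_cfg configs/sam2.1/sam2.1_hiera_b+.yaml \
+    --sam2_checkpoint <experiment_log_dir>/checkpoints/<checkpoint_name>.pt \
+    --base_video_dir <path_to_MOSE>/valid/JPEGImages \
+    --input_mask_dir <path_to_MOSE>/valid/Annotations \
+    --video_list_file training/assets/MOSE_sample_val_list.txt \
+    --output_mask_dir <path_to_output_masks>
+  ```
+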
+## Training on images and videos
+The code supports training on images and videos (similar to how SAM 2 is trained). We provide classes for loading SA-1B as a sample image dataset, SA-V as a sample video dataset, as well as any DAVIS-style video dataset (e.g. MOSE). Note that to train on SA-V, you must first extract all videos to JPEG frames using the provided extraction [script](./scripts/sav_frame_extraction_submitit.py). Below is an example of how to set up the datasets in your config to train on a mix of image and video datasets:
+
+```yaml
+data:
+ train:
+ _target_: training.dataset.sam2_datasets.TorchTrainMixedDataset
+ phases_per_epoch: ${phases_per_epoch} # Chunks a single epoch into smaller phases
+ batch_sizes: # List of batch sizes corresponding to each dataset
+ - ${bs1} # Batch size of dataset 1
+ - ${bs2} # Batch size of dataset 2
+ datasets:
+ # SA1B as an example of an image dataset
+ - _target_: training.dataset.vos_dataset.VOSDataset
+ training: true
+ video_dataset:
+ _target_: training.dataset.vos_raw_dataset.SA1BRawDataset
+ img_folder: ${path_to_img_folder}
+ gt_folder: ${path_to_gt_folder}
+ file_list_txt: ${path_to_train_filelist} # Optional
+ sampler:
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
+ num_frames: 1
+ max_num_objects: ${max_num_objects_per_image}
+ transforms: ${image_transforms}
+ # SA-V as an example of a video dataset
+ - _target_: training.dataset.vos_dataset.VOSDataset
+ training: true
+ video_dataset:
+ _target_: training.dataset.vos_raw_dataset.JSONRawDataset
+ img_folder: ${path_to_img_folder}
+ gt_folder: ${path_to_gt_folder}
+ file_list_txt: ${path_to_train_filelist} # Optional
+ ann_every: 4
+ sampler:
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
+ num_frames: 8 # Number of frames per video
+ max_num_objects: ${max_num_objects_per_video}
+ reverse_time_prob: ${reverse_time_prob} # probability to reverse video
+ transforms: ${video_transforms}
+ shuffle: True
+ num_workers: ${num_train_workers}
+ pin_memory: True
+ drop_last: True
+ collate_fn:
+ _target_: training.utils.data_utils.collate_fn
+ _partial_: true
+ dict_key: all
+```
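+
+Note that `batch_sizes` in this config is specified per dataset rather than globally. Unless an explicit `dataset_prob` is provided, `TorchTrainMixedDataset` mixes the dataloaders in proportion to their number of batches per epoch (see `training/dataset/sam2_datasets.py`).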
diff --git a/phantom/submodules/sam2/training/__init__.py b/phantom/submodules/sam2/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/phantom/submodules/sam2/training/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/training/assets/MOSE_sample_train_list.txt b/phantom/submodules/sam2/training/assets/MOSE_sample_train_list.txt
new file mode 100644
index 0000000000000000000000000000000000000000..28b22e3170f63de0fba3c77ef999f958cd6c48ff
--- /dev/null
+++ b/phantom/submodules/sam2/training/assets/MOSE_sample_train_list.txt
@@ -0,0 +1,1246 @@
+28191f94
+662487fe
+80906bf9
+7e704f2e
+efa25913
+b6f03bd9
+6834d249
+5a723c30
+07779415
+4ce088c6
+199995b5
+54273925
+4fa342f5
+110da3cf
+65856fa0
+46705bb3
+d869a3cf
+555aa049
+8f01fb2c
+37b07a28
+5e80b3dd
+ba0e4dd4
+6f5144b6
+acec8407
+93723f88
+c7c7528c
+97f58761
+e71f9faa
+e64c13dc
+8830d59d
+0e4aeed9
+63437cf3
+95215aa1
+255f86ef
+dc54aab2
+327cd258
+198021ad
+c690220c
+d25ff89d
+7875b874
+4fa6d325
+9fc933f6
+4d8baafe
+55ae6921
+6a3bc149
+89f8163f
+2d65d2ac
+dba172b1
+a14de179
+4017d1b3
+52ddf44c
+3ba93641
+34a5f964
+da7dee28
+872b76de
+1dc12eca
+265a69f4
+86a2b59f
+51e5ca25
+ddf80bcd
+6786602e
+4fa28c89
+f56942e9
+2184bb93
+d883e976
+bfe1469e
+bc4e7b11
+1c80acb0
+2b0e34d3
+56b9ce41
+15f0b0cd
+cc5d0dd1
+1b7eada8
+7286b176
+0ab42ab1
+adb82dc9
+c060b1e6
+3da63bd5
+5488796e
+d7066e20
+aab5ed11
+17f66311
+24df9789
+208fa934
+7ce2c865
+debe4249
+4c56bbea
+149dbae2
+beb693c9
+49eb0315
+e7ad4717
+4e016d5a
+95e24093
+07b5d86c
+80701b6c
+337dfa1e
+b624a46e
+3f849de8
+5db21df2
+47891b4c
+a966d7fd
+013103f6
+da5e4bc5
+ba9ea03d
+526195de
+57f3a53e
+b3aff7f8
+26048547
+bb7ee856
+aef0d049
+e35a8262
+57ad022e
+f45d3823
+e5e9eb29
+39cc637e
+a4fc4f17
+dd5a4739
+bbe97d18
+33602f6b
+9061dac9
+23454d80
+a20baeec
+794f01d4
+02de2f2a
+055fca57
+a69df343
+e307510e
+d07ad1be
+1fc5e086
+db6533a5
+fe9706b7
+87e32230
+8ba58e4c
+561f6380
+2ab9ba0f
+86571569
+756cc6c9
+aa185af5
+c6d7f94b
+7f54c579
+71f4b40e
+4190c83a
+fef0aba4
+2f7c71bb
+e4b6f2ef
+76adaeea
+11cdeb64
+733f2a02
+e50dbddb
+f643141f
+d2e75e95
+84559bc3
+7ade3068
+e69db797
+0b787263
+57895315
+d7969c29
+62529cd4
+203733e7
+48fd97a6
+723fd024
+849f0efb
+aafea009
+dd4eb8f1
+d18554ae
+f3c0f0cf
+90fe55b9
+b0ffaf3b
+e79ecd47
+d670ce7b
+56a5643a
+90ff1d09
+1fb378d9
+57014c7d
+994ed763
+5bc7ea74
+e99bd793
+cbb66185
+5f3fcff6
+05ed1023
+85efa9e3
+652929ce
+905d8740
+a6fcde01
+0fdf67f7
+a5cf4c8d
+e1c48bdd
+782551f7
+6acd353f
+c30641cf
+81d12756
+51befc31
+9d5ab5ca
+d262b7e4
+2cd705a9
+f7360199
+d3f3bf9d
+028f6f64
+94767cb4
+3a739934
+72433603
+ec66879d
+6149becc
+5845c157
+c5082b3c
+f89b54d0
+f3ada126
+409dcb8a
+4411fdee
+eb93ed20
+9cb1ba0e
+b8e1ec26
+7edd8b4f
+5e9412c0
+2744f35a
+dafeb75e
+f3f072f2
+6f1df574
+5a064706
+89c76ac4
+a6adef89
+76303516
+dbd67417
+a53ef3fa
+10552818
+ac7deb19
+2d403c59
+55c157f1
+214aeac3
+a9f5e251
+d7807996
+d1dba33b
+1367e367
+44476e77
+0644075b
+eda37457
+f2de4198
+9a4ce701
+46e00caf
+2ae75f99
+cd49fb99
+4e4483e7
+a0669957
+a6f0d882
+9ce1d54a
+1fc2314b
+21f363b3
+32ecef67
+70bcaf68
+115348f9
+60827ada
+a218e951
+6d30d5ac
+6da17988
+f22c39ce
+5825f0e0
+f415f9ad
+0d4feda2
+832fc243
+414ca58b
+a92390a0
+ddd383cc
+43dc67f7
+962ae0e2
+6dd74e7b
+2bcd6c3b
+b394847f
+637fd121
+d46e771b
+f6bfc699
+63f138de
+932ad0a6
+2080824a
+52fa9174
+843d3bf7
+f3431885
+5c20c48a
+134a2ab0
+2ea465de
+f6786ab5
+2bf49664
+a49ce97b
+6a50e93a
+a7c21e95
+616ad8ec
+0a8d7b41
+b0c90527
+2d893fb7
+19310598
+7744dc51
+4539b907
+9d299f60
+e495537a
+0b02886a
+f4c4a2ca
+e957b2b5
+e6f3bf07
+258944c8
+54364322
+ebb77f95
+0af03282
+cbdbc6c3
+494ecef0
+ee91f783
+9698f06e
+11e16068
+b942ce0a
+423a50e6
+fb16e746
+9c88ae45
+8620c024
+d3af3c85
+780a25de
+e569a15f
+c4f9f19e
+1106f3a7
+d37e29a7
+e53611da
+fdb2e432
+18ad3117
+6fcd426d
+3bfa8379
+3b19c5c3
+ff1142df
+cd182615
+b60ea255
+b3f5d019
+6dc5e55d
+103166c7
+37af9ac1
+ad1881d1
+731149b3
+90e3338a
+6aa0b6f2
+a25316a3
+dc8679e0
+571fb490
+80afed16
+983a551b
+a58578e5
+2bc0bba4
+1143b3fe
+fdd8dd49
+7fe2bf77
+890ef032
+8466eeb2
+c791ddbb
+631b82bd
+78bf9b51
+a99df45f
+2bdb692f
+e89b1501
+4e6aa1e8
+e5665030
+fe21fd5c
+635577d5
+4414cd3a
+03c99e83
+ff041cd1
+c33adbc2
+a988ec74
+576031e0
+03c21af7
+79b25f4b
+bbc485d6
+d36d5a0d
+efdab888
+b20e6781
+81fdc526
+e1c26a53
+7c6d3504
+52a04667
+f22e34d4
+bb936ead
+13f0606c
+d2abc61e
+af509e8f
+bea1c144
+e15e4de8
+e727099f
+b30744df
+ffb6a2e4
+0d31d3a6
+a23048fe
+7d452630
+6c736334
+046ed4f4
+94f4c2aa
+c290cfd3
+f7203226
+2fdae3c5
+7c78e351
+02b72b8d
+2d22d3be
+ba28d02e
+197f6587
+43199a98
+b563b04f
+9293b755
+9cef7489
+d156b96f
+15e9161e
+6d094cd5
+0d876a65
+c818d30a
+8094b12b
+a4a8e24b
+14655f54
+11c14893
+8a48f62a
+7f3d9c22
+d952481c
+03e0f9b8
+28980657
+6a0b5563
+5879983c
+37549a79
+4a7162bd
+7a6aa1ef
+0dc1b78c
+f6dba17b
+1dba51af
+b2f4d608
+e2e6f421
+464066da
+5d24e4ea
+1e75004d
+a02ed92c
+673adbcc
+c2a0c0fd
+85addee5
+54b8f502
+f5d2d8d3
+a19507e1
+803e1756
+0d1fe009
+5968c2d8
+b926e1ad
+a9162e14
+ae470d2b
+bd731802
+68c879f2
+21fe05d9
+c1ed21d0
+831498e4
+cc45a7f2
+cb170015
+59750be4
+30d1cb6b
+03e5f069
+106d33db
+3f003746
+3e5ad020
+8bc5a91c
+64b89eb5
+bfd28682
+f8687b9a
+7bbf38ee
+d6d92b30
+ceaa6c65
+677c8ed7
+dc33acf8
+cfd1de31
+e5be4781
+85585220
+5d2316f6
+dd3f4a07
+34535f5f
+3ae0bc5d
+f521e3c5
+74c2284f
+12a42fd9
+61403519
+88cd32f3
+662a1846
+825a1944
+cf376cf1
+8465d99c
+61a2e246
+62d44645
+103b3ca8
+c7e745ed
+4ed71139
+230c2edf
+529c6889
+9e509c0d
+54b9dea2
+a8934c0d
+29cffe2f
+48017512
+c9f7f69d
+ce691ee6
+21c89360
+3b97c07b
+ebd82d35
+2895bb8b
+7043c5c1
+85d694d7
+88fd7507
+18d8931e
+aa718745
+89b671bb
+0d8d30ae
+26163977
+a6121689
+1589579d
+159789c4
+f5ca8271
+fcc16740
+3158be0b
+860fc1f7
+3f54a330
+82f24ce7
+069f6a2a
+2fa9c523
+c9f1d87f
+efe9cbca
+8f969ea5
+4f5db794
+62c501f8
+2d3b0320
+c99637f0
+0f3b1fcb
+6e4ee861
+e0d9aff0
+230ddb91
+e14d1f96
+c83aa6a1
+eabdf66a
+6783a303
+81659eb2
+ce954bd7
+9a48c0c9
+0ab807b4
+f0617f71
+fe86f2f8
+61d80e22
+e4b6d2a0
+ac093040
+0e05fabe
+d0b507c3
+3d828137
+c4fa0bab
+f7783321
+ec27366a
+404e4c58
+073baf48
+0f685e01
+b0e98fdd
+b4891f7f
+a46b7b77
+ee059f99
+3c87888e
+8d23ddcc
+2d8d7d35
+5680be79
+fc79c03e
+20660b72
+53f67585
+90956534
+7e709e2d
+dae93f5c
+54b9dbba
+cc41ba05
+1e207fe0
+a9c6abf2
+35e0ca09
+e3dcd186
+1b8bb699
+92162474
+cdad6812
+50b91533
+570215ac
+6042d64a
+b6e2c041
+08746283
+7a056996
+b8651773
+adf443e1
+6a6e0e3b
+886ed981
+c1d57fea
+43030c4c
+7ebfbf57
+0770ad03
+e85301d5
+31ac3d98
+acaef45e
+8f415dd1
+fe2dc281
+2c0b9d99
+8e24501e
+911ec4ad
+8036b58e
+c3b350b9
+b6cadd11
+a3a80cf7
+88ab50cd
+59c755a8
+1339321a
+91b2f707
+97b0811e
+1da33959
+31b09833
+c1a40349
+708098a9
+1f220f98
+999e07cb
+0b5e5d29
+94c63453
+b826d642
+a598602d
+4c83eab8
+2efd5e50
+6ec5da3a
+9fcd95eb
+9a2c6b5b
+c205a718
+e638e950
+cb43141c
+494dd91d
+c4957274
+4975a81d
+a1f4c54d
+51e6fafa
+514490e5
+b0d09e6a
+c6726eb8
+06772c9a
+5a65ffd7
+3657c62b
+03012cfd
+529df209
+f1c38e66
+ab417352
+118a067e
+8957514f
+22e8b380
+3b1a4616
+a4457543
+57c9f6e0
+e362c16b
+0f809e41
+857e375e
+9cff25e3
+d754fb65
+6ad44b86
+051052d8
+a4564b94
+f68507d0
+80a7cf7b
+ad8cd1e0
+60b19cd3
+274fe944
+f06632aa
+628a337b
+92c96c05
+87fc565c
+6f6e6c37
+228a0234
+6487110a
+aa911a8e
+40c47fa3
+9606508b
+6ba9e61f
+c8c1d5a9
+cf01df5b
+9421b9ad
+006e6b64
+1c28e081
+06273084
+8925e11b
+b46c822b
+00501424
+cfd946b2
+2e92a7dc
+1c5f5bb6
+1d29944c
+8248698e
+19247506
+1eac1aff
+ee9caa47
+4a41cbf8
+d97c9309
+4ca87c14
+9707f1e3
+8bb9a221
+6605e67d
+95cf72d7
+1c6fb814
+033130b2
+4344808d
+5f14e5d2
+a810399b
+e325a6d4
+7014ddf4
+725d4bfb
+790285e8
+1a6a731f
+fbfb6e30
+0d4d88f6
+80ce18a4
+572495b7
+4b44dc50
+95dce33c
+4a6fb202
+3142014e
+a3c56751
+96b2a414
+c4aa176c
+fd1e394f
+93f0f509
+f494e9fa
+bfa42a75
+db5319c7
+aa92e070
+81220a93
+e4a72496
+fc467bf1
+5397b01d
+1dc0c9a0
+f6f8b4a6
+53dc7db4
+8ef303eb
+62ca45c9
+e9d3465e
+3784e3f6
+8c934e67
+5ba84e3f
+30e41f1e
+61cf0ec8
+e93e8f01
+fc6086dd
+a95f0aea
+33a04ef2
+6f295adb
+d2aa8c66
+724cc810
+d8623d26
+8d0d641a
+4bda7a76
+38030c69
+56199c41
+d2f4b9e2
+a7b8ac96
+64044df1
+fd1078cc
+0165667b
+16e1cca7
+915f0d9a
+eeaaa67e
+378430d5
+a84c60e6
+b4ae36cc
+2a3a0571
+13e6df75
+aa348c45
+59d7a11d
+68954daf
+d6f883c6
+f28b429a
+32dc49d4
+ccf14ee0
+7d512591
+9bdabdb2
+ed878d94
+54eda06d
+132561ee
+3c4b6736
+0367af42
+531c1c36
+843d8f25
+333bdbdc
+c3c21268
+07b00746
+c7fe0584
+49fc9f2e
+9ed4317a
+d29991b4
+98b0033d
+f0b922bf
+89fe6899
+58264713
+2f49220a
+6ff85ca5
+4b96b2c8
+a42f54f5
+aa425600
+22fdee40
+dde85a9d
+3722f6fe
+e7529cbc
+5ae23f9f
+cc32235b
+730bc486
+b12701b7
+a96b3010
+16130bd3
+2c713560
+f7935d24
+a7eb6616
+0d6e7177
+100edaef
+0442a954
+60f4fa43
+37bf7edf
+76b18413
+ab0646a9
+c575434d
+1e356390
+5416fbb7
+df7cf932
+269872de
+9033b607
+c2e88575
+932542cd
+23e046fb
+3d08dadd
+7999adc5
+ed81c485
+3bd7facd
+1feae28e
+8d72533b
+6a8d35d6
+65308bdc
+7f0b7662
+98290486
+fee3371f
+c463c7e5
+faf7d852
+75c34dc5
+96a6722e
+e5605136
+851bc5d9
+15c41c4b
+6a39e104
+5fbff256
+0e7001dd
+5411113f
+3ea2f7f2
+242b74b1
+87727003
+ec6dd0e9
+980baf58
+9d0b7bf1
+9113c9d4
+5ebef6bd
+a5f70ce7
+b0240233
+06ad78e0
+8745edd0
+d8e8d984
+ac32a655
+38568758
+d48c552d
+0b27d5f7
+c65d0736
+800e3c14
+d37a5857
+bcebc660
+d3ab52cc
+405e3ee7
+e33cddc9
+b0197182
+89fd5681
+9e192417
+8554c402
+aae923b8
+31af515d
+75b26f88
+60471744
+460945aa
+c0fe8e1a
+1731babb
+2e85e35d
+f9c20062
+115da184
+ddfa88c7
+359003f8
+dfa99126
+bf04814f
+f407a414
+e18723c4
+0a7a3629
+c07ab37e
+1251a1c9
+4d09d22a
+5984ed74
+34504f63
+ced51047
+08ff419c
+d942e98c
+2697f864
+3b671a61
+72a2f7e2
+48e7cafe
+6adad2f7
+18840617
+1e44f47e
+36cc4055
+8c494902
+2982de7a
+6a428397
+c4a0ecfb
+231d6945
+fe470104
+f93e1bd0
+bd18bc5a
+7bd70d93
+8f81a0ee
+db78e7a1
+7593caea
+86d5b29b
+5457b298
+0d967fd1
+62372d4c
+68259db3
+f0944ea2
+7b017dbf
+bcb6e338
+03692b14
+f7d36a47
+1ca2531a
+6728528d
+1fc0e6a8
+0ba9c5ad
+a386eaa2
+b0c5459f
+1d64aff3
+b97d4f1a
+b3745d91
+c461003e
+910bf878
+ae42601c
+8d2ddeff
+aaecaa39
+250b5034
+edb11192
+7bfe9b57
+6d533759
+51586b36
+a38d648a
+8fdb48e5
+6075d6b0
+3588ea03
+bc844942
+398d41f5
+660e3b70
+0b99f522
+f169fd1b
+7bfa2ab5
+ab461319
+25153e58
+002b4dce
+a2df1bee
+550a7357
+b604f2dd
+2f477d05
+bdf9eb5a
+857ddc6e
+c8f0fd41
+6df96f15
+e147ab26
+788da8e8
+02221fb0
+d1d95c61
+a3f0cb28
+3a6e6ace
+67c2909a
+220382ab
+eaed776d
+aff08a61
+b99d1bd6
+9d9ae988
+34ccea00
+41dae436
+18513251
+ad57acd1
+67f110fc
+3f09f5c9
+25ef7d43
+12a5d0d7
+3ff48b8b
+26ed56e6
+c047a092
+bb8639e1
+8788747f
+584838d4
+f8e5f837
+657242e8
+cb8eedf4
+74a917f1
+578f71da
+c9b27125
+22e1f53c
+f40145c2
+4795259b
+3f313a2f
+c9012bf6
+22167a50
+6e7f9437
+ef51a724
+356e0fcb
+d3ea999d
+08a5c662
+85aa3b0e
+579fadec
+7bc95dc2
+c097af8e
+f01d8b9f
+80fb79c6
+ea65e6b7
+29ff29f6
+9e1f739d
+b7fb59c9
+e2160f17
+0be33bc1
+e96b9b04
+b1affe79
+c4f4b2e2
+f4c8ffb1
+6a009e50
+a8828854
+2786f841
+a64e724c
+5f54d077
+7040385d
+6e0f0ecc
+f33d3c15
+8108b358
+46a502de
+1e0fb02a
+ddbdfa32
+e7b34ab6
+c9080ed1
+395224b3
+33f9ab47
+c245ecda
+c28d81a9
+37303a3b
+6380dd6f
+2fb5a55b
+83b7c53c
+41c8d0d2
+3aab2d13
+dc7d21fb
+86a88668
+37bb38fe
+ab6413a8
+bbe585b2
+a0ca072a
+9d5940d2
+ddb1d0b1
+a946317a
+988b29a4
+89dc0432
+5df8490d
+5e167efa
+50a86faa
+fe6a535a
+a9f8b8b4
+6e2dce1b
+d0696759
+c09da3b2
+f07dd347
+67408899
+406165ff
+a4a9d03d
+9b5f0f47
+5f3e8022
+1d7a23e0
+25af2eeb
+82a3db34
+c9351029
+6c93d44c
+f088ad1c
+9ee59f51
+b5276b3f
+ca74a924
+781af187
+fa3e0b85
+b898c99e
+1ca51f06
+5a92a0c1
+138c81fe
+d0722d0f
+05a7d84d
+e18f1dea
+799a2d61
+8276e558
+f0ba8748
+ce733e8a
+2f9d0911
+58f24fa4
+66a25278
+3135d31d
+4b9223ee
+bdd5e6b3
+ddbebec1
+8dbebbd9
+3020b38f
+e607450d
+724a5d1c
+91b754c5
+2e85e790
+3a407bd9
+fd137178
+a304029b
+4023fc77
+440d5072
+2eb73c7c
+164a7305
+b33ade7c
+277ad883
+b0f7e75c
+74107936
+83924bdb
+b72beb78
+86c01d64
+f6f441eb
+23b9a3ea
+80b73f1a
+93c6411d
+1e95ef5e
+800b5eac
+9519832a
+ae043406
+b06a902e
+1dbca5cc
+571f88a1
+b1faf52b
+45572497
+8d016cdb
+f92cdae8
+316931f8
+f9884439
+e1b7f212
+e23c6392
+ccfae073
+5aa1efda
+74f0687c
+eaff3301
+b6520a94
+c5398714
+15e7e4d1
+0fc00006
+8cf49218
+3a8ddc0a
+e7e2a0b9
+eec4c008
+8d73085e
+77e246da
+00e92ab4
+f76f6cf9
+19801183
+233406ef
+b80e028c
+342c0b2a
+a2768c47
+99350a74
+adbd400b
+f3978ade
+b87a4f6c
+fa95a6a2
+6dff20c9
+935b5ad8
+dbbbb401
+1b6472c1
+9c0e6331
+04ae7a6b
+4c94e4f3
+90cb46cb
+2831ecf5
+ff77a145
+79af6097
+ba61a719
+abcb7665
+7e87750e
+c4c7bc5d
+3a670b81
+3d9a7023
+82667d52
+a4587f62
+ca619b7f
+7c5462f5
+bda5c60d
+e6e48ac8
+405c6000
+7981f344
+f7375ab3
+bb467ff9
+cfc68a82
+e417a6d8
+1a6177c1
+7b75dace
+b1af350d
+484d48a3
+1f805416
+7416ab4e
+1291276c
+9e85179b
+5a74660c
+7e6d00df
+01e3cec8
+ee2c0688
+f6de8226
+a217538c
+b432c3ef
+49e5ff4e
+035359e5
+8ae8e7ed
+2da12766
+cac39070
+115adda4
+1a2872dc
+fac3378e
+294e7bf8
+a1a4991f
+c062f4d7
+72b2b77d
+158062aa
+9ae447a7
+a7b05677
+fdfd5d56
+eac1a9e6
+a5905593
+59992293
+84298fae
+f708e55f
+093d3d93
+75d26197
+924f5d88
+3184a7ec
+b454fdbc
+2d9101b8
+ae70fb7c
+4385b2c4
+63b37343
+0b4b662c
+2883ae72
+ffcab778
+0f96e2d7
+897066e3
+f23e98ad
+797a7b7e
+2fc476f9
diff --git a/phantom/submodules/sam2/training/assets/MOSE_sample_val_list.txt b/phantom/submodules/sam2/training/assets/MOSE_sample_val_list.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9721028718245ff5297fdae59d35a7c89cb5f56a
--- /dev/null
+++ b/phantom/submodules/sam2/training/assets/MOSE_sample_val_list.txt
@@ -0,0 +1,200 @@
+32e5d721
+5bad0bab
+267bfd6c
+0a43a414
+56c56ca9
+9a1146b3
+c6ad7aaf
+78a1f4b1
+fc455e73
+072e7b3f
+77ccb57d
+a76ee415
+8cdcfc17
+5d518b42
+376dd830
+0e843fc8
+2af0e766
+2bd4e845
+de2f2a6a
+ade9ee91
+001ca3cb
+fc4c1c67
+8ef55579
+b84ce852
+4cc8528a
+767ffaaa
+112a2ef0
+a338c8aa
+cbd144f5
+5ff72128
+86a949e2
+9f2323ac
+1fab1d1c
+75924351
+ef55817b
+02deca50
+4d979d99
+4d65f873
+28470fa0
+0d1575fe
+06ea172e
+29a6ddc2
+797f1bec
+780e7a99
+b9ed5b44
+02a236b4
+607d8ff5
+af5666b2
+0558d0ed
+a938c6b2
+103df575
+77110e80
+739e5a07
+6763a576
+06ebc138
+ba4b3b09
+b35cc2f3
+4e0597a0
+5949ee84
+5348d547
+323c4236
+b3b51117
+55727ddd
+ab2714f3
+d2878895
+c0734cb3
+94f7c53e
+2a2745e5
+442ffb54
+3592425a
+50ae03b0
+5f150435
+3067f9fa
+9ffb2818
+adeaf5aa
+31caacec
+1cd99b86
+aa22f9d0
+8fa50320
+e6348d2c
+42ff84a5
+8c8b7913
+c96adcbc
+495be321
+db735509
+ee113fc4
+a678cdab
+c409ca4d
+68d2b259
+592b4dee
+4e2b4dc7
+eb4d26e1
+2009a00f
+bec5c89d
+67191f24
+a3e85b4b
+da7080cd
+80d978e9
+36dcb93f
+a41e8c44
+12fdc864
+46d140ea
+657c9dd9
+a86f84ee
+90c1c43d
+33015509
+afc7664d
+23df06e1
+291d4799
+0ab75563
+251bf059
+bcefdcc4
+ce9a2796
+94d3403a
+8f2e04bc
+f9cda066
+9dfa2cc5
+66924c91
+e765a09e
+15654ee1
+48e0bd39
+ee095221
+2463609b
+544d0d1f
+51b8c2e1
+d321dde4
+4cb11a5f
+d7058a0d
+37af282a
+fabae187
+7be91184
+181ec185
+2d16ceeb
+b56be4b1
+6699eff0
+79acac96
+d61c4665
+0c13e1e7
+100f6ecf
+71217dfc
+82df0888
+4c42c747
+c9fdf703
+d2efeb4b
+69ed9d14
+64914fb6
+255bedbc
+4ea934d8
+a034feb2
+e4f4ddae
+e36a3026
+c1489591
+111bb373
+e1d9fb32
+93e22d48
+c1ec4b26
+d9638e69
+60ab04c5
+cfe7773a
+62132822
+2f5fb2a3
+7bdd197d
+033333fd
+130fcdbe
+12e509c2
+67138c33
+6f90cc5f
+4e3020fe
+bbdd8bb7
+b399ccdb
+fecd10d2
+2e0967f7
+f509054f
+792c6ff7
+48e2afc5
+d904c048
+111e0a5c
+b83024e2
+e6a7b79c
+bdc5ccf7
+b8146d00
+9d394f1a
+645b84f9
+95ab2d0f
+e6f8a31d
+b4f876fb
+dc2c570d
+3afd02d7
+5c80c82c
+b1b32ddd
+9f25fc61
+ba538072
+f8916fef
+43c04ad2
+a658e949
+2861dd53
+f6e40aba
+09d305d1
+aac33bff
+8d9d4c08
diff --git a/phantom/submodules/sam2/training/dataset/__init__.py b/phantom/submodules/sam2/training/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/phantom/submodules/sam2/training/dataset/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/training/dataset/sam2_datasets.py b/phantom/submodules/sam2/training/dataset/sam2_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..6deda056bea555fc07ace455ccc62c606a7b81c9
--- /dev/null
+++ b/phantom/submodules/sam2/training/dataset/sam2_datasets.py
@@ -0,0 +1,180 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+from typing import Callable, Iterable, List, Optional, Sequence
+
+import torch
+
+from torch.utils.data import BatchSampler, DataLoader, Dataset, IterableDataset, Subset
+
+from torch.utils.data.distributed import DistributedSampler
+
+
+class MixedDataLoader:
+ def __init__(self, dataloaders: List[DataLoader], mixing_prob: torch.FloatTensor):
+ """
+ Args:
+ dataloaders (List[DataLoader]): List of DataLoaders to be mixed.
+ mixing_prob (torch.FloatTensor): Probability of each dataloader to be sampled from
+
+ """
+ assert len(dataloaders) == mixing_prob.shape[0]
+ self.dataloaders = dataloaders
+ self.mixing_prob = mixing_prob
+ # Iterator state
+ self._iter_dls = None
+ self._iter_mixing_prob = None
+ self.random_generator = torch.Generator()
+
+ def __len__(self):
+ return sum([len(d) for d in self.dataloaders])
+
+ def __iter__(self):
+ # Synchronize dataloader seeds
+ self.random_generator.manual_seed(42)
+ self._iter_dls = [iter(loader) for loader in self.dataloaders]
+ self._iter_mixing_prob = self.mixing_prob.clone()
+ return self
+
+ def __next__(self):
+ """
+ Sample a dataloader to sample from based on mixing probabilities. If one of the dataloaders is exhausted, we continue sampling from the other loaders until all are exhausted.
+ """
+ if self._iter_dls is None:
+ raise TypeError(f"{type(self).__name__} object is not an iterator")
+
+ while self._iter_mixing_prob.any(): # at least one D-Loader with non-zero prob.
+ dataset_idx = self._iter_mixing_prob.multinomial(
+ 1, generator=self.random_generator
+ ).item()
+ try:
+ item = next(self._iter_dls[dataset_idx])
+ return item
+ except StopIteration:
+                # No more iterations for this dataset, set its mixing probability to zero and try again.
+ self._iter_mixing_prob[dataset_idx] = 0
+ except Exception as e:
+ # log and raise any other unexpected error.
+ logging.error(e)
+ raise e
+
+ # Exhausted all iterators
+ raise StopIteration
+
+
+class TorchTrainMixedDataset:
+ def __init__(
+ self,
+ datasets: List[Dataset],
+ batch_sizes: List[int],
+ num_workers: int,
+ shuffle: bool,
+ pin_memory: bool,
+ drop_last: bool,
+ collate_fn: Optional[Callable] = None,
+ worker_init_fn: Optional[Callable] = None,
+ phases_per_epoch: int = 1,
+ dataset_prob: Optional[List[float]] = None,
+ ) -> None:
+ """
+ Args:
+ datasets (List[Dataset]): List of Datasets to be mixed.
+ batch_sizes (List[int]): Batch sizes for each dataset in the list.
+ num_workers (int): Number of workers per dataloader.
+ shuffle (bool): Whether or not to shuffle data.
+ pin_memory (bool): If True, use pinned memory when loading tensors from disk.
+ drop_last (bool): Whether or not to drop the last batch of data.
+ collate_fn (Callable): Function to merge a list of samples into a mini-batch.
+ worker_init_fn (Callable): Function to init each dataloader worker.
+ phases_per_epoch (int): Number of phases per epoch.
+ dataset_prob (List[float]): Probability of choosing the dataloader to sample from. Should sum to 1.0
+ """
+
+ self.datasets = datasets
+ self.batch_sizes = batch_sizes
+ self.num_workers = num_workers
+ self.shuffle = shuffle
+ self.pin_memory = pin_memory
+ self.drop_last = drop_last
+ self.collate_fn = collate_fn
+ self.worker_init_fn = worker_init_fn
+ assert len(self.datasets) > 0
+ for dataset in self.datasets:
+ assert not isinstance(dataset, IterableDataset), "Not supported"
+ # `RepeatFactorWrapper` requires calling set_epoch first to get its length
+ self._set_dataset_epoch(dataset, 0)
+ self.phases_per_epoch = phases_per_epoch
+ self.chunks = [None] * len(datasets)
+ if dataset_prob is None:
+ # If not provided, assign each dataset a probability proportional to its length.
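+            # (e.g. two datasets yielding 1000 and 3000 batches per epoch mix with probabilities [0.25, 0.75])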
+ dataset_lens = [
+ (math.floor(len(d) / bs) if drop_last else math.ceil(len(d) / bs))
+ for d, bs in zip(datasets, batch_sizes)
+ ]
+ total_len = sum(dataset_lens)
+ dataset_prob = torch.tensor([d_len / total_len for d_len in dataset_lens])
+ else:
+ assert len(dataset_prob) == len(datasets)
+ dataset_prob = torch.tensor(dataset_prob)
+
+ logging.info(f"Dataset mixing probabilities: {dataset_prob.tolist()}")
+ assert dataset_prob.sum().item() == 1.0, "Probabilities should sum to 1.0"
+ self.dataset_prob = dataset_prob
+
+ def _set_dataset_epoch(self, dataset, epoch: int) -> None:
+ if hasattr(dataset, "epoch"):
+ dataset.epoch = epoch
+ if hasattr(dataset, "set_epoch"):
+ dataset.set_epoch(epoch)
+
+ def get_loader(self, epoch) -> Iterable:
+ dataloaders = []
+ for d_idx, (dataset, batch_size) in enumerate(
+ zip(self.datasets, self.batch_sizes)
+ ):
+ if self.phases_per_epoch > 1:
+                # Major epoch that loops over the entire dataset
+                # (one main epoch spans `phases_per_epoch` smaller phases)
+ main_epoch = epoch // self.phases_per_epoch
+
+                # Phase within the main epoch
+ local_phase = epoch % self.phases_per_epoch
+
+                # Start of a new data epoch, or the job was resumed after preemption.
+                if local_phase == 0 or self.chunks[d_idx] is None:
+                    # set seed for dataset epoch
+                    # If using RepeatFactorWrapper, this step correctly re-samples indices before chunking.
+ self._set_dataset_epoch(dataset, main_epoch)
+
+ # Separate random generator for subset sampling
+ g = torch.Generator()
+ g.manual_seed(main_epoch)
+ self.chunks[d_idx] = torch.chunk(
+ torch.randperm(len(dataset), generator=g),
+ self.phases_per_epoch,
+ )
+
+ dataset = Subset(dataset, self.chunks[d_idx][local_phase])
+ else:
+ self._set_dataset_epoch(dataset, epoch)
+
+ sampler = DistributedSampler(dataset, shuffle=self.shuffle)
+ sampler.set_epoch(epoch)
+
+ batch_sampler = BatchSampler(sampler, batch_size, drop_last=self.drop_last)
+ dataloaders.append(
+ DataLoader(
+ dataset,
+ num_workers=self.num_workers,
+ pin_memory=self.pin_memory,
+ batch_sampler=batch_sampler,
+ collate_fn=self.collate_fn,
+ worker_init_fn=self.worker_init_fn,
+ )
+ )
+ return MixedDataLoader(dataloaders, self.dataset_prob)
diff --git a/phantom/submodules/sam2/training/dataset/transforms.py b/phantom/submodules/sam2/training/dataset/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e5c6512ac7fd9548273fb152a3b57ef75e4fc18
--- /dev/null
+++ b/phantom/submodules/sam2/training/dataset/transforms.py
@@ -0,0 +1,528 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Transforms and data augmentation for both image + bbox.
+"""
+
+import logging
+
+import random
+from typing import Iterable
+
+import torch
+import torchvision.transforms as T
+import torchvision.transforms.functional as F
+import torchvision.transforms.v2.functional as Fv2
+from PIL import Image as PILImage
+
+from torchvision.transforms import InterpolationMode
+
+from training.utils.data_utils import VideoDatapoint
+
+
+def hflip(datapoint, index):
+
+ datapoint.frames[index].data = F.hflip(datapoint.frames[index].data)
+ for obj in datapoint.frames[index].objects:
+ if obj.segment is not None:
+ obj.segment = F.hflip(obj.segment)
+
+ return datapoint
+
+
+def get_size_with_aspect_ratio(image_size, size, max_size=None):
+ w, h = image_size
+ if max_size is not None:
+ min_original_size = float(min((w, h)))
+ max_original_size = float(max((w, h)))
+ if max_original_size / min_original_size * size > max_size:
+ size = max_size * min_original_size / max_original_size
+
+ if (w <= h and w == size) or (h <= w and h == size):
+ return (h, w)
+
+ if w < h:
+ ow = int(round(size))
+ oh = int(round(size * h / w))
+ else:
+ oh = int(round(size))
+ ow = int(round(size * w / h))
+
+ return (oh, ow)
+
+
+def resize(datapoint, index, size, max_size=None, square=False, v2=False):
+ # size can be min_size (scalar) or (w, h) tuple
+
+ def get_size(image_size, size, max_size=None):
+ if isinstance(size, (list, tuple)):
+ return size[::-1]
+ else:
+ return get_size_with_aspect_ratio(image_size, size, max_size)
+
+ if square:
+ size = size, size
+ else:
+ cur_size = (
+ datapoint.frames[index].data.size()[-2:][::-1]
+ if v2
+ else datapoint.frames[index].data.size
+ )
+ size = get_size(cur_size, size, max_size)
+
+ old_size = (
+ datapoint.frames[index].data.size()[-2:][::-1]
+ if v2
+ else datapoint.frames[index].data.size
+ )
+ if v2:
+ datapoint.frames[index].data = Fv2.resize(
+ datapoint.frames[index].data, size, antialias=True
+ )
+ else:
+ datapoint.frames[index].data = F.resize(datapoint.frames[index].data, size)
+
+ new_size = (
+ datapoint.frames[index].data.size()[-2:][::-1]
+ if v2
+ else datapoint.frames[index].data.size
+ )
+
+ for obj in datapoint.frames[index].objects:
+ if obj.segment is not None:
+ obj.segment = F.resize(obj.segment[None, None], size).squeeze()
+
+ h, w = size
+ datapoint.frames[index].size = (h, w)
+ return datapoint
+
+
+def pad(datapoint, index, padding, v2=False):
+ old_h, old_w = datapoint.frames[index].size
+ h, w = old_h, old_w
+ if len(padding) == 2:
+ # assumes that we only pad on the bottom right corners
+ datapoint.frames[index].data = F.pad(
+ datapoint.frames[index].data, (0, 0, padding[0], padding[1])
+ )
+ h += padding[1]
+ w += padding[0]
+ else:
+ # left, top, right, bottom
+ datapoint.frames[index].data = F.pad(
+ datapoint.frames[index].data,
+ (padding[0], padding[1], padding[2], padding[3]),
+ )
+ h += padding[1] + padding[3]
+ w += padding[0] + padding[2]
+
+ datapoint.frames[index].size = (h, w)
+
+ for obj in datapoint.frames[index].objects:
+ if obj.segment is not None:
+ if v2:
+ if len(padding) == 2:
+ obj.segment = Fv2.pad(obj.segment, (0, 0, padding[0], padding[1]))
+ else:
+ obj.segment = Fv2.pad(obj.segment, tuple(padding))
+ else:
+ if len(padding) == 2:
+ obj.segment = F.pad(obj.segment, (0, 0, padding[0], padding[1]))
+ else:
+ obj.segment = F.pad(obj.segment, tuple(padding))
+ return datapoint
+
+
+class RandomHorizontalFlip:
+ def __init__(self, consistent_transform, p=0.5):
+ self.p = p
+ self.consistent_transform = consistent_transform
+
+ def __call__(self, datapoint, **kwargs):
+ if self.consistent_transform:
+ if random.random() < self.p:
+ for i in range(len(datapoint.frames)):
+ datapoint = hflip(datapoint, i)
+ return datapoint
+ for i in range(len(datapoint.frames)):
+ if random.random() < self.p:
+ datapoint = hflip(datapoint, i)
+ return datapoint
+
+
+class RandomResizeAPI:
+ def __init__(
+ self, sizes, consistent_transform, max_size=None, square=False, v2=False
+ ):
+ if isinstance(sizes, int):
+ sizes = (sizes,)
+ assert isinstance(sizes, Iterable)
+ self.sizes = list(sizes)
+ self.max_size = max_size
+ self.square = square
+ self.consistent_transform = consistent_transform
+ self.v2 = v2
+
+ def __call__(self, datapoint, **kwargs):
+ if self.consistent_transform:
+ size = random.choice(self.sizes)
+ for i in range(len(datapoint.frames)):
+ datapoint = resize(
+ datapoint, i, size, self.max_size, square=self.square, v2=self.v2
+ )
+ return datapoint
+ for i in range(len(datapoint.frames)):
+ size = random.choice(self.sizes)
+ datapoint = resize(
+ datapoint, i, size, self.max_size, square=self.square, v2=self.v2
+ )
+ return datapoint
+
+
+class ToTensorAPI:
+ def __init__(self, v2=False):
+ self.v2 = v2
+
+ def __call__(self, datapoint: VideoDatapoint, **kwargs):
+ for img in datapoint.frames:
+ if self.v2:
+ img.data = Fv2.to_image_tensor(img.data)
+ else:
+ img.data = F.to_tensor(img.data)
+ return datapoint
+
+
+class NormalizeAPI:
+ def __init__(self, mean, std, v2=False):
+ self.mean = mean
+ self.std = std
+ self.v2 = v2
+
+ def __call__(self, datapoint: VideoDatapoint, **kwargs):
+ for img in datapoint.frames:
+ if self.v2:
+ img.data = Fv2.convert_image_dtype(img.data, torch.float32)
+ img.data = Fv2.normalize(img.data, mean=self.mean, std=self.std)
+ else:
+ img.data = F.normalize(img.data, mean=self.mean, std=self.std)
+
+ return datapoint
+
+
+class ComposeAPI:
+ def __init__(self, transforms):
+ self.transforms = transforms
+
+ def __call__(self, datapoint, **kwargs):
+ for t in self.transforms:
+ datapoint = t(datapoint, **kwargs)
+ return datapoint
+
+ def __repr__(self):
+ format_string = self.__class__.__name__ + "("
+ for t in self.transforms:
+ format_string += "\n"
+ format_string += " {0}".format(t)
+ format_string += "\n)"
+ return format_string
+
+
+class RandomGrayscale:
+ def __init__(self, consistent_transform, p=0.5):
+ self.p = p
+ self.consistent_transform = consistent_transform
+ self.Grayscale = T.Grayscale(num_output_channels=3)
+
+ def __call__(self, datapoint: VideoDatapoint, **kwargs):
+ if self.consistent_transform:
+ if random.random() < self.p:
+ for img in datapoint.frames:
+ img.data = self.Grayscale(img.data)
+ return datapoint
+ for img in datapoint.frames:
+ if random.random() < self.p:
+ img.data = self.Grayscale(img.data)
+ return datapoint
+
+
+class ColorJitter:
+ def __init__(self, consistent_transform, brightness, contrast, saturation, hue):
+ self.consistent_transform = consistent_transform
+ self.brightness = (
+ brightness
+ if isinstance(brightness, list)
+ else [max(0, 1 - brightness), 1 + brightness]
+ )
+ self.contrast = (
+ contrast
+ if isinstance(contrast, list)
+ else [max(0, 1 - contrast), 1 + contrast]
+ )
+ self.saturation = (
+ saturation
+ if isinstance(saturation, list)
+ else [max(0, 1 - saturation), 1 + saturation]
+ )
+ self.hue = hue if isinstance(hue, list) or hue is None else ([-hue, hue])
+
+ def __call__(self, datapoint: VideoDatapoint, **kwargs):
+ if self.consistent_transform:
+ # Create a color jitter transformation params
+ (
+ fn_idx,
+ brightness_factor,
+ contrast_factor,
+ saturation_factor,
+ hue_factor,
+ ) = T.ColorJitter.get_params(
+ self.brightness, self.contrast, self.saturation, self.hue
+ )
+ for img in datapoint.frames:
+ if not self.consistent_transform:
+ (
+ fn_idx,
+ brightness_factor,
+ contrast_factor,
+ saturation_factor,
+ hue_factor,
+ ) = T.ColorJitter.get_params(
+ self.brightness, self.contrast, self.saturation, self.hue
+ )
+ for fn_id in fn_idx:
+ if fn_id == 0 and brightness_factor is not None:
+ img.data = F.adjust_brightness(img.data, brightness_factor)
+ elif fn_id == 1 and contrast_factor is not None:
+ img.data = F.adjust_contrast(img.data, contrast_factor)
+ elif fn_id == 2 and saturation_factor is not None:
+ img.data = F.adjust_saturation(img.data, saturation_factor)
+ elif fn_id == 3 and hue_factor is not None:
+ img.data = F.adjust_hue(img.data, hue_factor)
+ return datapoint
+
+
+class RandomAffine:
+ def __init__(
+ self,
+ degrees,
+ consistent_transform,
+ scale=None,
+ translate=None,
+ shear=None,
+ image_mean=(123, 116, 103),
+ log_warning=True,
+ num_tentatives=1,
+ image_interpolation="bicubic",
+ ):
+ """
+ The mask is required for this transform.
+        If `consistent_transform` is True, the same random affine is applied to all frames and masks.
+ """
+ self.degrees = degrees if isinstance(degrees, list) else ([-degrees, degrees])
+ self.scale = scale
+ self.shear = (
+ shear if isinstance(shear, list) else ([-shear, shear] if shear else None)
+ )
+ self.translate = translate
+ self.fill_img = image_mean
+ self.consistent_transform = consistent_transform
+ self.log_warning = log_warning
+ self.num_tentatives = num_tentatives
+
+ if image_interpolation == "bicubic":
+ self.image_interpolation = InterpolationMode.BICUBIC
+ elif image_interpolation == "bilinear":
+ self.image_interpolation = InterpolationMode.BILINEAR
+ else:
+ raise NotImplementedError
+
+ def __call__(self, datapoint: VideoDatapoint, **kwargs):
+ for _tentative in range(self.num_tentatives):
+ res = self.transform_datapoint(datapoint)
+ if res is not None:
+ return res
+
+ if self.log_warning:
+ logging.warning(
+ f"Skip RandomAffine for zero-area mask in first frame after {self.num_tentatives} tentatives"
+ )
+ return datapoint
+
+ def transform_datapoint(self, datapoint: VideoDatapoint):
+ _, height, width = F.get_dimensions(datapoint.frames[0].data)
+ img_size = [width, height]
+
+ if self.consistent_transform:
+ # Create a random affine transformation
+ affine_params = T.RandomAffine.get_params(
+ degrees=self.degrees,
+ translate=self.translate,
+ scale_ranges=self.scale,
+ shears=self.shear,
+ img_size=img_size,
+ )
+
+ for img_idx, img in enumerate(datapoint.frames):
+ this_masks = [
+ obj.segment.unsqueeze(0) if obj.segment is not None else None
+ for obj in img.objects
+ ]
+ if not self.consistent_transform:
+                # if not consistent, create new random affine params for every frame & mask pair
+ affine_params = T.RandomAffine.get_params(
+ degrees=self.degrees,
+ translate=self.translate,
+ scale_ranges=self.scale,
+ shears=self.shear,
+ img_size=img_size,
+ )
+
+ transformed_bboxes, transformed_masks = [], []
+ for i in range(len(img.objects)):
+ if this_masks[i] is None:
+ transformed_masks.append(None)
+ # Dummy bbox for a dummy target
+ transformed_bboxes.append(torch.tensor([[0, 0, 1, 1]]))
+ else:
+ transformed_mask = F.affine(
+ this_masks[i],
+ *affine_params,
+ interpolation=InterpolationMode.NEAREST,
+ fill=0.0,
+ )
+ if img_idx == 0 and transformed_mask.max() == 0:
+ # We are dealing with a video and the object is not visible in the first frame
+ # Return the datapoint without transformation
+ return None
+ transformed_masks.append(transformed_mask.squeeze())
+
+ for i in range(len(img.objects)):
+ img.objects[i].segment = transformed_masks[i]
+
+ img.data = F.affine(
+ img.data,
+ *affine_params,
+ interpolation=self.image_interpolation,
+ fill=self.fill_img,
+ )
+ return datapoint
+
+
+def random_mosaic_frame(
+ datapoint,
+ index,
+ grid_h,
+ grid_w,
+ target_grid_y,
+ target_grid_x,
+ should_hflip,
+):
+ # Step 1: downsize the images and paste them into a mosaic
+ image_data = datapoint.frames[index].data
+ is_pil = isinstance(image_data, PILImage.Image)
+ if is_pil:
+ H_im = image_data.height
+ W_im = image_data.width
+ image_data_output = PILImage.new("RGB", (W_im, H_im))
+ else:
+ H_im = image_data.size(-2)
+ W_im = image_data.size(-1)
+ image_data_output = torch.zeros_like(image_data)
+
+ downsize_cache = {}
+ for grid_y in range(grid_h):
+ for grid_x in range(grid_w):
+ y_offset_b = grid_y * H_im // grid_h
+ x_offset_b = grid_x * W_im // grid_w
+ y_offset_e = (grid_y + 1) * H_im // grid_h
+ x_offset_e = (grid_x + 1) * W_im // grid_w
+ H_im_downsize = y_offset_e - y_offset_b
+ W_im_downsize = x_offset_e - x_offset_b
+
+ if (H_im_downsize, W_im_downsize) in downsize_cache:
+ image_data_downsize = downsize_cache[(H_im_downsize, W_im_downsize)]
+ else:
+ image_data_downsize = F.resize(
+ image_data,
+ size=(H_im_downsize, W_im_downsize),
+ interpolation=InterpolationMode.BILINEAR,
+ antialias=True, # antialiasing for downsizing
+ )
+ downsize_cache[(H_im_downsize, W_im_downsize)] = image_data_downsize
+ if should_hflip[grid_y, grid_x].item():
+ image_data_downsize = F.hflip(image_data_downsize)
+
+ if is_pil:
+ image_data_output.paste(image_data_downsize, (x_offset_b, y_offset_b))
+ else:
+ image_data_output[:, y_offset_b:y_offset_e, x_offset_b:x_offset_e] = (
+ image_data_downsize
+ )
+
+ datapoint.frames[index].data = image_data_output
+
+ # Step 2: downsize the masks and paste them into the target grid of the mosaic
+ for obj in datapoint.frames[index].objects:
+ if obj.segment is None:
+ continue
+ assert obj.segment.shape == (H_im, W_im) and obj.segment.dtype == torch.uint8
+ segment_output = torch.zeros_like(obj.segment)
+
+ target_y_offset_b = target_grid_y * H_im // grid_h
+ target_x_offset_b = target_grid_x * W_im // grid_w
+ target_y_offset_e = (target_grid_y + 1) * H_im // grid_h
+ target_x_offset_e = (target_grid_x + 1) * W_im // grid_w
+ target_H_im_downsize = target_y_offset_e - target_y_offset_b
+ target_W_im_downsize = target_x_offset_e - target_x_offset_b
+
+ segment_downsize = F.resize(
+ obj.segment[None, None],
+ size=(target_H_im_downsize, target_W_im_downsize),
+ interpolation=InterpolationMode.BILINEAR,
+ antialias=True, # antialiasing for downsizing
+ )[0, 0]
+ if should_hflip[target_grid_y, target_grid_x].item():
+ segment_downsize = F.hflip(segment_downsize[None, None])[0, 0]
+
+ segment_output[
+ target_y_offset_b:target_y_offset_e, target_x_offset_b:target_x_offset_e
+ ] = segment_downsize
+ obj.segment = segment_output
+
+ return datapoint
+
+
+class RandomMosaicVideoAPI:
+ def __init__(self, prob=0.15, grid_h=2, grid_w=2, use_random_hflip=False):
+ self.prob = prob
+ self.grid_h = grid_h
+ self.grid_w = grid_w
+ self.use_random_hflip = use_random_hflip
+
+ def __call__(self, datapoint, **kwargs):
+ if random.random() > self.prob:
+ return datapoint
+
+ # select a random location to place the target mask in the mosaic
+ target_grid_y = random.randint(0, self.grid_h - 1)
+ target_grid_x = random.randint(0, self.grid_w - 1)
+ # whether to flip each grid in the mosaic horizontally
+ if self.use_random_hflip:
+ should_hflip = torch.rand(self.grid_h, self.grid_w) < 0.5
+ else:
+ should_hflip = torch.zeros(self.grid_h, self.grid_w, dtype=torch.bool)
+ for i in range(len(datapoint.frames)):
+ datapoint = random_mosaic_frame(
+ datapoint,
+ i,
+ grid_h=self.grid_h,
+ grid_w=self.grid_w,
+ target_grid_y=target_grid_y,
+ target_grid_x=target_grid_x,
+ should_hflip=should_hflip,
+ )
+
+ return datapoint
diff --git a/phantom/submodules/sam2/training/dataset/utils.py b/phantom/submodules/sam2/training/dataset/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a658df234c3dcf74404f844b5be793b0545485ed
--- /dev/null
+++ b/phantom/submodules/sam2/training/dataset/utils.py
@@ -0,0 +1,104 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Some wrapping utilities extended from pytorch's to support repeat factor sampling in particular"""
+
+from typing import Iterable
+
+import torch
+from torch.utils.data import (
+ ConcatDataset as TorchConcatDataset,
+ Dataset,
+ Subset as TorchSubset,
+)
+
+
+class ConcatDataset(TorchConcatDataset):
+ def __init__(self, datasets: Iterable[Dataset]) -> None:
+ super(ConcatDataset, self).__init__(datasets)
+
+ self.repeat_factors = torch.cat([d.repeat_factors for d in datasets])
+
+ def set_epoch(self, epoch: int):
+ for dataset in self.datasets:
+ if hasattr(dataset, "epoch"):
+ dataset.epoch = epoch
+ if hasattr(dataset, "set_epoch"):
+ dataset.set_epoch(epoch)
+
+
+class Subset(TorchSubset):
+ def __init__(self, dataset, indices) -> None:
+ super(Subset, self).__init__(dataset, indices)
+
+ self.repeat_factors = dataset.repeat_factors[indices]
+ assert len(indices) == len(self.repeat_factors)
+
+
+# Adapted from Detectron2
+class RepeatFactorWrapper(Dataset):
+ """
+ Thin wrapper around a dataset to implement repeat factor sampling.
+ The underlying dataset must have a repeat_factors member to indicate the per-image factor.
+    Set it to all ones to disable repeat factor sampling.
+ """
+
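+    # Typical usage (a sketch): wrap a dataset that exposes `repeat_factors`, call
+    # `set_epoch(epoch)` before iterating each epoch, and then index it like a regular dataset.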
+ def __init__(self, dataset, seed: int = 0):
+ self.dataset = dataset
+ self.epoch_ids = None
+ self._seed = seed
+
+ # Split into whole number (_int_part) and fractional (_frac_part) parts.
+ self._int_part = torch.trunc(dataset.repeat_factors)
+ self._frac_part = dataset.repeat_factors - self._int_part
+
+ def _get_epoch_indices(self, generator):
+ """
+ Create a list of dataset indices (with repeats) to use for one epoch.
+
+ Args:
+ generator (torch.Generator): pseudo random number generator used for
+ stochastic rounding.
+
+ Returns:
+ torch.Tensor: list of dataset indices to use in one epoch. Each index
+ is repeated based on its calculated repeat factor.
+ """
+ # Since repeat factors are fractional, we use stochastic rounding so
+ # that the target repeat factor is achieved in expectation over the
+ # course of training
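+        # For example (hypothetical value): a repeat factor of 2.3 yields 2 copies of the
+        # index, plus a third copy with probability 0.3, i.e. 2.3 copies in expectation.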
+ rands = torch.rand(len(self._frac_part), generator=generator)
+ rep_factors = self._int_part + (rands < self._frac_part).float()
+ # Construct a list of indices in which we repeat images as specified
+ indices = []
+ for dataset_index, rep_factor in enumerate(rep_factors):
+ indices.extend([dataset_index] * int(rep_factor.item()))
+ return torch.tensor(indices, dtype=torch.int64)
+
+ def __len__(self):
+ if self.epoch_ids is None:
+            # Here we raise an error instead of returning the original len(self.dataset) to
+            # avoid accidentally using the unwrapped length. Otherwise it's error-prone, since
+            # the length changes to `len(self.epoch_ids)` after set_epoch is called.
+ raise RuntimeError("please call set_epoch first to get wrapped length")
+ # return len(self.dataset)
+
+ return len(self.epoch_ids)
+
+ def set_epoch(self, epoch: int):
+ g = torch.Generator()
+ g.manual_seed(self._seed + epoch)
+ self.epoch_ids = self._get_epoch_indices(g)
+ if hasattr(self.dataset, "set_epoch"):
+ self.dataset.set_epoch(epoch)
+
+ def __getitem__(self, idx):
+ if self.epoch_ids is None:
+ raise RuntimeError(
+ "Repeat ids haven't been computed. Did you forget to call set_epoch?"
+ )
+
+ return self.dataset[self.epoch_ids[idx]]
diff --git a/phantom/submodules/sam2/training/dataset/vos_dataset.py b/phantom/submodules/sam2/training/dataset/vos_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1e9d39fe184cf0d86fbf22b5385dc05988cab83
--- /dev/null
+++ b/phantom/submodules/sam2/training/dataset/vos_dataset.py
@@ -0,0 +1,162 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import random
+from copy import deepcopy
+
+import numpy as np
+
+import torch
+from iopath.common.file_io import g_pathmgr
+from PIL import Image as PILImage
+from torchvision.datasets.vision import VisionDataset
+
+from training.dataset.vos_raw_dataset import VOSRawDataset
+from training.dataset.vos_sampler import VOSSampler
+from training.dataset.vos_segment_loader import JSONSegmentLoader
+
+from training.utils.data_utils import Frame, Object, VideoDatapoint
+
+MAX_RETRIES = 100
+
+
+class VOSDataset(VisionDataset):
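+    # Wraps a raw video dataset and a frame/object sampler into transformed VideoDatapoints;
+    # `repeat_factors` (a constant `multiplier` per video) is consumed by the repeat-factor
+    # sampling utilities in training.dataset.utils.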
+ def __init__(
+ self,
+ transforms,
+ training: bool,
+ video_dataset: VOSRawDataset,
+ sampler: VOSSampler,
+ multiplier: int,
+ always_target=True,
+ target_segments_available=True,
+ ):
+ self._transforms = transforms
+ self.training = training
+ self.video_dataset = video_dataset
+ self.sampler = sampler
+
+ self.repeat_factors = torch.ones(len(self.video_dataset), dtype=torch.float32)
+ self.repeat_factors *= multiplier
+ print(f"Raw dataset length = {len(self.video_dataset)}")
+
+ self.curr_epoch = 0 # Used in case data loader behavior changes across epochs
+ self.always_target = always_target
+ self.target_segments_available = target_segments_available
+
+ def _get_datapoint(self, idx):
+
+ for retry in range(MAX_RETRIES):
+ try:
+ if isinstance(idx, torch.Tensor):
+ idx = idx.item()
+ # sample a video
+ video, segment_loader = self.video_dataset.get_video(idx)
+ # sample frames and object indices to be used in a datapoint
+ sampled_frms_and_objs = self.sampler.sample(
+ video, segment_loader, epoch=self.curr_epoch
+ )
+                break  # Successfully loaded the video
+ except Exception as e:
+ if self.training:
+ logging.warning(
+ f"Loading failed (id={idx}); Retry {retry} with exception: {e}"
+ )
+ idx = random.randrange(0, len(self.video_dataset))
+ else:
+ # Shouldn't fail to load a val video
+ raise e
+
+ datapoint = self.construct(video, sampled_frms_and_objs, segment_loader)
+ for transform in self._transforms:
+ datapoint = transform(datapoint, epoch=self.curr_epoch)
+ return datapoint
+
+ def construct(self, video, sampled_frms_and_objs, segment_loader):
+ """
+ Constructs a VideoDatapoint sample to pass to transforms
+ """
+ sampled_frames = sampled_frms_and_objs.frames
+ sampled_object_ids = sampled_frms_and_objs.object_ids
+
+ images = []
+ rgb_images = load_images(sampled_frames)
+ # Iterate over the sampled frames and store their rgb data and object data (bbox, segment)
+ for frame_idx, frame in enumerate(sampled_frames):
+ w, h = rgb_images[frame_idx].size
+ images.append(
+ Frame(
+ data=rgb_images[frame_idx],
+ objects=[],
+ )
+ )
+ # We load the gt segments associated with the current frame
+ if isinstance(segment_loader, JSONSegmentLoader):
+ segments = segment_loader.load(
+ frame.frame_idx, obj_ids=sampled_object_ids
+ )
+ else:
+ segments = segment_loader.load(frame.frame_idx)
+ for obj_id in sampled_object_ids:
+ # Extract the segment
+ if obj_id in segments:
+ assert (
+ segments[obj_id] is not None
+ ), "None targets are not supported"
+ # segment is uint8 and remains uint8 throughout the transforms
+ segment = segments[obj_id].to(torch.uint8)
+ else:
+ # There is no target, we either use a zero mask target or drop this object
+ if not self.always_target:
+ continue
+ segment = torch.zeros(h, w, dtype=torch.uint8)
+
+ images[frame_idx].objects.append(
+ Object(
+ object_id=obj_id,
+ frame_index=frame.frame_idx,
+ segment=segment,
+ )
+ )
+ return VideoDatapoint(
+ frames=images,
+ video_id=video.video_id,
+ size=(h, w),
+ )
+
+ def __getitem__(self, idx):
+ return self._get_datapoint(idx)
+
+ def __len__(self):
+ return len(self.video_dataset)
+
+
+def load_images(frames):
+ all_images = []
+ cache = {}
+ for frame in frames:
+ if frame.data is None:
+ # Load the frame rgb data from file
+ path = frame.image_path
+ if path in cache:
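+                # Reuse the already-decoded image for repeated paths (deepcopy, so per-frame
+                # transforms cannot mutate a shared PIL image).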
+ all_images.append(deepcopy(all_images[cache[path]]))
+ continue
+ with g_pathmgr.open(path, "rb") as fopen:
+ all_images.append(PILImage.open(fopen).convert("RGB"))
+ cache[path] = len(all_images) - 1
+ else:
+ # The frame rgb data has already been loaded
+ # Convert it to a PILImage
+ all_images.append(tensor_2_PIL(frame.data))
+
+ return all_images
+
+
+def tensor_2_PIL(data: torch.Tensor) -> PILImage.Image:
+ data = data.cpu().numpy().transpose((1, 2, 0)) * 255.0
+ data = data.astype(np.uint8)
+ return PILImage.fromarray(data)
diff --git a/phantom/submodules/sam2/training/dataset/vos_raw_dataset.py b/phantom/submodules/sam2/training/dataset/vos_raw_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..44fe893717a3e3bd85b043baa33d349b52b4b34e
--- /dev/null
+++ b/phantom/submodules/sam2/training/dataset/vos_raw_dataset.py
@@ -0,0 +1,308 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import glob
+import logging
+import os
+from dataclasses import dataclass
+
+from typing import List, Optional
+
+import pandas as pd
+
+import torch
+
+from iopath.common.file_io import g_pathmgr
+
+from omegaconf.listconfig import ListConfig
+
+from training.dataset.vos_segment_loader import (
+ JSONSegmentLoader,
+ MultiplePNGSegmentLoader,
+ PalettisedPNGSegmentLoader,
+ SA1BSegmentLoader,
+)
+
+
+@dataclass
+class VOSFrame:
+ frame_idx: int
+ image_path: str
+ data: Optional[torch.Tensor] = None
+ is_conditioning_only: Optional[bool] = False
+
+
+@dataclass
+class VOSVideo:
+ video_name: str
+ video_id: int
+ frames: List[VOSFrame]
+
+ def __len__(self):
+ return len(self.frames)
+
+
+class VOSRawDataset:
+ def __init__(self):
+ pass
+
+ def get_video(self, idx):
+ raise NotImplementedError()
+
+
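+# Assumed directory layout (inferred from the segment loaders below): frames live under
+# img_folder/<video>/<frame>.jpg; masks live under gt_folder/<video>/<frame>.png when
+# is_palette is True, or under gt_folder/<video>/<obj_id>/<frame>.png otherwise.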
+class PNGRawDataset(VOSRawDataset):
+ def __init__(
+ self,
+ img_folder,
+ gt_folder,
+ file_list_txt=None,
+ excluded_videos_list_txt=None,
+ sample_rate=1,
+ is_palette=True,
+ single_object_mode=False,
+ truncate_video=-1,
+ frames_sampling_mult=False,
+ ):
+ self.img_folder = img_folder
+ self.gt_folder = gt_folder
+ self.sample_rate = sample_rate
+ self.is_palette = is_palette
+ self.single_object_mode = single_object_mode
+ self.truncate_video = truncate_video
+
+ # Read the subset defined in file_list_txt
+ if file_list_txt is not None:
+ with g_pathmgr.open(file_list_txt, "r") as f:
+ subset = [os.path.splitext(line.strip())[0] for line in f]
+ else:
+ subset = os.listdir(self.img_folder)
+
+ # Read and process excluded files if provided
+ if excluded_videos_list_txt is not None:
+ with g_pathmgr.open(excluded_videos_list_txt, "r") as f:
+ excluded_files = [os.path.splitext(line.strip())[0] for line in f]
+ else:
+ excluded_files = []
+
+        # Keep only the videos that are not in excluded_files
+ self.video_names = sorted(
+ [video_name for video_name in subset if video_name not in excluded_files]
+ )
+
+ if self.single_object_mode:
+ # single object mode
+ self.video_names = sorted(
+ [
+ os.path.join(video_name, obj)
+ for video_name in self.video_names
+ for obj in os.listdir(os.path.join(self.gt_folder, video_name))
+ ]
+ )
+
+ if frames_sampling_mult:
+ video_names_mult = []
+ for video_name in self.video_names:
+ num_frames = len(os.listdir(os.path.join(self.img_folder, video_name)))
+ video_names_mult.extend([video_name] * num_frames)
+ self.video_names = video_names_mult
+
+ def get_video(self, idx):
+ """
+        Given a video index, return the corresponding VOSVideo object and its segment loader.
+ """
+ video_name = self.video_names[idx]
+
+ if self.single_object_mode:
+ video_frame_root = os.path.join(
+ self.img_folder, os.path.dirname(video_name)
+ )
+ else:
+ video_frame_root = os.path.join(self.img_folder, video_name)
+
+ video_mask_root = os.path.join(self.gt_folder, video_name)
+
+ if self.is_palette:
+ segment_loader = PalettisedPNGSegmentLoader(video_mask_root)
+ else:
+ segment_loader = MultiplePNGSegmentLoader(
+ video_mask_root, self.single_object_mode
+ )
+
+ all_frames = sorted(glob.glob(os.path.join(video_frame_root, "*.jpg")))
+ if self.truncate_video > 0:
+ all_frames = all_frames[: self.truncate_video]
+ frames = []
+ for _, fpath in enumerate(all_frames[:: self.sample_rate]):
+ fid = int(os.path.basename(fpath).split(".")[0])
+ frames.append(VOSFrame(fid, image_path=fpath))
+ video = VOSVideo(video_name, idx, frames)
+ return video, segment_loader
+
+ def __len__(self):
+ return len(self.video_names)
+
+
+class SA1BRawDataset(VOSRawDataset):
+ def __init__(
+ self,
+ img_folder,
+ gt_folder,
+ file_list_txt=None,
+ excluded_videos_list_txt=None,
+ num_frames=1,
+ mask_area_frac_thresh=1.1, # no filtering by default
+ uncertain_iou=-1, # no filtering by default
+ ):
+ self.img_folder = img_folder
+ self.gt_folder = gt_folder
+ self.num_frames = num_frames
+ self.mask_area_frac_thresh = mask_area_frac_thresh
+ self.uncertain_iou = uncertain_iou # stability score
+
+ # Read the subset defined in file_list_txt
+ if file_list_txt is not None:
+ with g_pathmgr.open(file_list_txt, "r") as f:
+ subset = [os.path.splitext(line.strip())[0] for line in f]
+ else:
+ subset = os.listdir(self.img_folder)
+ subset = [
+ path.split(".")[0] for path in subset if path.endswith(".jpg")
+ ] # remove extension
+
+ # Read and process excluded files if provided
+ if excluded_videos_list_txt is not None:
+ with g_pathmgr.open(excluded_videos_list_txt, "r") as f:
+ excluded_files = [os.path.splitext(line.strip())[0] for line in f]
+ else:
+ excluded_files = []
+
+        # Keep only the images that are not in excluded_files
+ self.video_names = [
+ video_name for video_name in subset if video_name not in excluded_files
+ ]
+
+ def get_video(self, idx):
+ """
+        Given an image index, return the single-image VOSVideo object and its segment loader.
+ """
+ video_name = self.video_names[idx]
+
+ video_frame_path = os.path.join(self.img_folder, video_name + ".jpg")
+ video_mask_path = os.path.join(self.gt_folder, video_name + ".json")
+
+ segment_loader = SA1BSegmentLoader(
+ video_mask_path,
+ mask_area_frac_thresh=self.mask_area_frac_thresh,
+ video_frame_path=video_frame_path,
+ uncertain_iou=self.uncertain_iou,
+ )
+
+ frames = []
+ for frame_idx in range(self.num_frames):
+ frames.append(VOSFrame(frame_idx, image_path=video_frame_path))
+ video_name = video_name.split("_")[-1] # filename is sa_{int}
+ # video id needs to be image_id to be able to load correct annotation file during eval
+ video = VOSVideo(video_name, int(video_name), frames)
+ return video, segment_loader
+
+ def __len__(self):
+ return len(self.video_names)
+
+
+class JSONRawDataset(VOSRawDataset):
+ """
+    Dataset where the annotations are in the SA-V json format
+ """
+
+ def __init__(
+ self,
+ img_folder,
+ gt_folder,
+ file_list_txt=None,
+ excluded_videos_list_txt=None,
+ sample_rate=1,
+ rm_unannotated=True,
+ ann_every=1,
+ frames_fps=24,
+ ):
+ self.gt_folder = gt_folder
+ self.img_folder = img_folder
+ self.sample_rate = sample_rate
+ self.rm_unannotated = rm_unannotated
+ self.ann_every = ann_every
+ self.frames_fps = frames_fps
+
+ # Read and process excluded files if provided
+ excluded_files = []
+ if excluded_videos_list_txt is not None:
+ if isinstance(excluded_videos_list_txt, str):
+ excluded_videos_lists = [excluded_videos_list_txt]
+ elif isinstance(excluded_videos_list_txt, ListConfig):
+ excluded_videos_lists = list(excluded_videos_list_txt)
+ else:
+ raise NotImplementedError
+
+ for excluded_videos_list_txt in excluded_videos_lists:
+ with open(excluded_videos_list_txt, "r") as f:
+ excluded_files.extend(
+ [os.path.splitext(line.strip())[0] for line in f]
+ )
+ excluded_files = set(excluded_files)
+
+ # Read the subset defined in file_list_txt
+ if file_list_txt is not None:
+ with g_pathmgr.open(file_list_txt, "r") as f:
+ subset = [os.path.splitext(line.strip())[0] for line in f]
+ else:
+ subset = os.listdir(self.img_folder)
+
+ self.video_names = sorted(
+ [video_name for video_name in subset if video_name not in excluded_files]
+ )
+
+ def get_video(self, video_idx):
+ """
+        Given a video index, return the corresponding VOSVideo object and its segment loader.
+ """
+ video_name = self.video_names[video_idx]
+ video_json_path = os.path.join(self.gt_folder, video_name + "_manual.json")
+ segment_loader = JSONSegmentLoader(
+ video_json_path=video_json_path,
+ ann_every=self.ann_every,
+ frames_fps=self.frames_fps,
+ )
+
+ frame_ids = [
+ int(os.path.splitext(frame_name)[0])
+ for frame_name in sorted(
+ os.listdir(os.path.join(self.img_folder, video_name))
+ )
+ ]
+
+ frames = [
+ VOSFrame(
+ frame_id,
+ image_path=os.path.join(
+ self.img_folder, f"{video_name}/%05d.jpg" % (frame_id)
+ ),
+ )
+ for frame_id in frame_ids[:: self.sample_rate]
+ ]
+
+ if self.rm_unannotated:
+ # Eliminate the frames that have not been annotated
+ valid_frame_ids = [
+ i * segment_loader.ann_every
+ for i, annot in enumerate(segment_loader.frame_annots)
+ if annot is not None and None not in annot
+ ]
+ frames = [f for f in frames if f.frame_idx in valid_frame_ids]
+
+ video = VOSVideo(video_name, video_idx, frames)
+ return video, segment_loader
+
+ def __len__(self):
+ return len(self.video_names)
diff --git a/phantom/submodules/sam2/training/dataset/vos_sampler.py b/phantom/submodules/sam2/training/dataset/vos_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad84b759d0f66191a84017d17140d128b634ca0
--- /dev/null
+++ b/phantom/submodules/sam2/training/dataset/vos_sampler.py
@@ -0,0 +1,105 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import random
+from dataclasses import dataclass
+from typing import List
+
+from training.dataset.vos_segment_loader import LazySegments
+
+MAX_RETRIES = 1000
+
+
+@dataclass
+class SampledFramesAndObjects:
+ frames: List[int]
+ object_ids: List[int]
+
+
+class VOSSampler:
+ def __init__(self, sort_frames=True):
+ # frames are ordered by frame id when sort_frames is True
+ self.sort_frames = sort_frames
+
+    def sample(self, video, segment_loader, epoch=None):
+ raise NotImplementedError()
+
+
+class RandomUniformSampler(VOSSampler):
+ def __init__(
+ self,
+ num_frames,
+ max_num_objects,
+ reverse_time_prob=0.0,
+ ):
+ self.num_frames = num_frames
+ self.max_num_objects = max_num_objects
+ self.reverse_time_prob = reverse_time_prob
+
+ def sample(self, video, segment_loader, epoch=None):
+
+ for retry in range(MAX_RETRIES):
+ if len(video.frames) < self.num_frames:
+ raise Exception(
+ f"Cannot sample {self.num_frames} frames from video {video.video_name} as it only has {len(video.frames)} annotated frames."
+ )
+ start = random.randrange(0, len(video.frames) - self.num_frames + 1)
+ frames = [video.frames[start + step] for step in range(self.num_frames)]
+ if random.uniform(0, 1) < self.reverse_time_prob:
+ # Reverse time
+ frames = frames[::-1]
+
+ # Get first frame object ids
+ visible_object_ids = []
+ loaded_segms = segment_loader.load(frames[0].frame_idx)
+ if isinstance(loaded_segms, LazySegments):
+ # LazySegments for SA1BRawDataset
+ visible_object_ids = list(loaded_segms.keys())
+ else:
+ for object_id, segment in segment_loader.load(
+ frames[0].frame_idx
+ ).items():
+ if segment.sum():
+ visible_object_ids.append(object_id)
+
+ # First frame needs to have at least a target to track
+ if len(visible_object_ids) > 0:
+ break
+ if retry >= MAX_RETRIES - 1:
+ raise Exception("No visible objects")
+
+ object_ids = random.sample(
+ visible_object_ids,
+ min(len(visible_object_ids), self.max_num_objects),
+ )
+ return SampledFramesAndObjects(frames=frames, object_ids=object_ids)
+
+
+class EvalSampler(VOSSampler):
+ """
+ VOS Sampler for evaluation: sampling all the frames and all the objects in a video
+ """
+
+ def __init__(
+ self,
+ ):
+ super().__init__()
+
+ def sample(self, video, segment_loader, epoch=None):
+ """
+ Sampling all the frames and all the objects
+ """
+ if self.sort_frames:
+ # ordered by frame id
+ frames = sorted(video.frames, key=lambda x: x.frame_idx)
+ else:
+ # use the original order
+ frames = video.frames
+ object_ids = segment_loader.load(frames[0].frame_idx).keys()
+ if len(object_ids) == 0:
+ raise Exception("First frame of the video has no objects")
+
+ return SampledFramesAndObjects(frames=frames, object_ids=object_ids)
diff --git a/phantom/submodules/sam2/training/dataset/vos_segment_loader.py b/phantom/submodules/sam2/training/dataset/vos_segment_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..27e17010cc8b010e103c3ac399689d80da7cfde9
--- /dev/null
+++ b/phantom/submodules/sam2/training/dataset/vos_segment_loader.py
@@ -0,0 +1,300 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import glob
+import json
+import os
+
+import numpy as np
+import pandas as pd
+import torch
+
+from PIL import Image as PILImage
+
+try:
+ from pycocotools import mask as mask_utils
+except ImportError:
+ pass
+
+
+class JSONSegmentLoader:
+ def __init__(self, video_json_path, ann_every=1, frames_fps=24, valid_obj_ids=None):
+        # Annotations in the json are provided every ann_every-th frame
+ self.ann_every = ann_every
+ # Ids of the objects to consider when sampling this video
+ self.valid_obj_ids = valid_obj_ids
+ with open(video_json_path, "r") as f:
+ data = json.load(f)
+ if isinstance(data, list):
+ self.frame_annots = data
+ elif isinstance(data, dict):
+ masklet_field_name = "masklet" if "masklet" in data else "masks"
+ self.frame_annots = data[masklet_field_name]
+ if "fps" in data:
+ if isinstance(data["fps"], list):
+ annotations_fps = int(data["fps"][0])
+ else:
+ annotations_fps = int(data["fps"])
+ assert frames_fps % annotations_fps == 0
+ self.ann_every = frames_fps // annotations_fps
+ else:
+ raise NotImplementedError
+
+ def load(self, frame_id, obj_ids=None):
+ assert frame_id % self.ann_every == 0
+ rle_mask = self.frame_annots[frame_id // self.ann_every]
+
+ valid_objs_ids = set(range(len(rle_mask)))
+ if self.valid_obj_ids is not None:
+ # Remove the masklets that have been filtered out for this video
+ valid_objs_ids &= set(self.valid_obj_ids)
+ if obj_ids is not None:
+ # Only keep the objects that have been sampled
+ valid_objs_ids &= set(obj_ids)
+ valid_objs_ids = sorted(list(valid_objs_ids))
+
+        # Construct rle_mask_filtered so that it only contains the rle masks we are interested in
+ id_2_idx = {}
+ rle_mask_filtered = []
+ for obj_id in valid_objs_ids:
+ if rle_mask[obj_id] is not None:
+ id_2_idx[obj_id] = len(rle_mask_filtered)
+ rle_mask_filtered.append(rle_mask[obj_id])
+ else:
+ id_2_idx[obj_id] = None
+
+ # Decode the masks
+ raw_segments = torch.from_numpy(mask_utils.decode(rle_mask_filtered)).permute(
+ 2, 0, 1
+ ) # (num_obj, h, w)
+ segments = {}
+ for obj_id in valid_objs_ids:
+ if id_2_idx[obj_id] is None:
+ segments[obj_id] = None
+ else:
+ idx = id_2_idx[obj_id]
+ segments[obj_id] = raw_segments[idx]
+ return segments
+
+ def get_valid_obj_frames_ids(self, num_frames_min=None):
+ # For each object, find all the frames with a valid (not None) mask
+ num_objects = len(self.frame_annots[0])
+
+        # The result dict associates each obj_id with the ids of its valid frames
+ res = {obj_id: [] for obj_id in range(num_objects)}
+
+ for annot_idx, annot in enumerate(self.frame_annots):
+ for obj_id in range(num_objects):
+ if annot[obj_id] is not None:
+ res[obj_id].append(int(annot_idx * self.ann_every))
+
+ if num_frames_min is not None:
+ # Remove masklets that have less than num_frames_min valid masks
+ for obj_id, valid_frames in list(res.items()):
+ if len(valid_frames) < num_frames_min:
+ res.pop(obj_id)
+
+ return res
+
+
+class PalettisedPNGSegmentLoader:
+ def __init__(self, video_png_root):
+ """
+ SegmentLoader for datasets with masks stored as palettised PNGs.
+        video_png_root: the folder containing all the masks stored as palettised PNGs
+ """
+ self.video_png_root = video_png_root
+ # build a mapping from frame id to their PNG mask path
+ # note that in some datasets, the PNG paths could have more
+ # than 5 digits, e.g. "00000000.png" instead of "00000.png"
+ png_filenames = os.listdir(self.video_png_root)
+ self.frame_id_to_png_filename = {}
+ for filename in png_filenames:
+ frame_id, _ = os.path.splitext(filename)
+ self.frame_id_to_png_filename[int(frame_id)] = filename
+
+ def load(self, frame_id):
+ """
+        load the single palettised mask for frame_id from disk (the filename is looked up in self.frame_id_to_png_filename)
+ Args:
+ frame_id: int, define the mask path
+ Return:
+ binary_segments: dict
+ """
+        # build the mask path
+ mask_path = os.path.join(
+ self.video_png_root, self.frame_id_to_png_filename[frame_id]
+ )
+
+ # load the mask
+ masks = PILImage.open(mask_path).convert("P")
+ masks = np.array(masks)
+
+ object_id = pd.unique(masks.flatten())
+ object_id = object_id[object_id != 0] # remove background (0)
+
+ # convert into N binary segmentation masks
+ binary_segments = {}
+ for i in object_id:
+ bs = masks == i
+ binary_segments[i] = torch.from_numpy(bs)
+
+ return binary_segments
+
+ def __len__(self):
+ return
+
+
+class MultiplePNGSegmentLoader:
+ def __init__(self, video_png_root, single_object_mode=False):
+ """
+        video_png_root: the folder containing all the masks stored as PNGs
+ single_object_mode: whether to load only a single object at a time
+ """
+ self.video_png_root = video_png_root
+ self.single_object_mode = single_object_mode
+ # read a mask to know the resolution of the video
+ if self.single_object_mode:
+ tmp_mask_path = glob.glob(os.path.join(video_png_root, "*.png"))[0]
+ else:
+ tmp_mask_path = glob.glob(os.path.join(video_png_root, "*", "*.png"))[0]
+ tmp_mask = np.array(PILImage.open(tmp_mask_path))
+ self.H = tmp_mask.shape[0]
+ self.W = tmp_mask.shape[1]
+ if self.single_object_mode:
+ self.obj_id = (
+ int(video_png_root.split("/")[-1]) + 1
+ ) # offset by 1 as bg is 0
+ else:
+ self.obj_id = None
+
+ def load(self, frame_id):
+ if self.single_object_mode:
+ return self._load_single_png(frame_id)
+ else:
+ return self._load_multiple_pngs(frame_id)
+
+ def _load_single_png(self, frame_id):
+ """
+        load a single png mask from disk (path: f'{self.video_png_root}/{frame_id:05d}.png')
+ Args:
+ frame_id: int, define the mask path
+ Return:
+ binary_segments: dict
+ """
+ mask_path = os.path.join(self.video_png_root, f"{frame_id:05d}.png")
+ binary_segments = {}
+
+ if os.path.exists(mask_path):
+ mask = np.array(PILImage.open(mask_path))
+ else:
+ # if png doesn't exist, empty mask
+ mask = np.zeros((self.H, self.W), dtype=bool)
+ binary_segments[self.obj_id] = torch.from_numpy(mask > 0)
+ return binary_segments
+
+ def _load_multiple_pngs(self, frame_id):
+ """
+        load multiple png masks from disk (path: f'{self.video_png_root}/{obj_id}/{frame_id:05d}.png')
+ Args:
+ frame_id: int, define the mask path
+ Return:
+ binary_segments: dict
+ """
+ # get the path
+ all_objects = sorted(glob.glob(os.path.join(self.video_png_root, "*")))
+ num_objects = len(all_objects)
+ assert num_objects > 0
+
+ # load the masks
+ binary_segments = {}
+ for obj_folder in all_objects:
+ # obj_folder is {video_name}/{obj_id}, obj_id is specified by the name of the folder
+ obj_id = int(obj_folder.split("/")[-1])
+ obj_id = obj_id + 1 # offset 1 as bg is 0
+ mask_path = os.path.join(obj_folder, f"{frame_id:05d}.png")
+ if os.path.exists(mask_path):
+ mask = np.array(PILImage.open(mask_path))
+ else:
+ mask = np.zeros((self.H, self.W), dtype=bool)
+ binary_segments[obj_id] = torch.from_numpy(mask > 0)
+
+ return binary_segments
+
+ def __len__(self):
+ return
+
+
+class LazySegments:
+ """
+ Only decodes segments that are actually used.
+ """
+
+ def __init__(self):
+ self.segments = {}
+ self.cache = {}
+
+ def __setitem__(self, key, item):
+ self.segments[key] = item
+
+ def __getitem__(self, key):
+ if key in self.cache:
+ return self.cache[key]
+ rle = self.segments[key]
+ mask = torch.from_numpy(mask_utils.decode([rle])).permute(2, 0, 1)[0]
+ self.cache[key] = mask
+ return mask
+
+ def __contains__(self, key):
+ return key in self.segments
+
+ def __len__(self):
+ return len(self.segments)
+
+ def keys(self):
+ return self.segments.keys()
+
+
+class SA1BSegmentLoader:
+ def __init__(
+ self,
+ video_mask_path,
+ mask_area_frac_thresh=1.1,
+ video_frame_path=None,
+ uncertain_iou=-1,
+ ):
+ with open(video_mask_path, "r") as f:
+ self.frame_annots = json.load(f)
+
+ if mask_area_frac_thresh <= 1.0:
+ # Lazily read frame
+ orig_w, orig_h = PILImage.open(video_frame_path).size
+ area = orig_w * orig_h
+
+ self.frame_annots = self.frame_annots["annotations"]
+
+ rle_masks = []
+ for frame_annot in self.frame_annots:
+ if not frame_annot["area"] > 0:
+ continue
+ if ("uncertain_iou" in frame_annot) and (
+ frame_annot["uncertain_iou"] < uncertain_iou
+ ):
+ # uncertain_iou is stability score
+ continue
+ if (
+ mask_area_frac_thresh <= 1.0
+ and (frame_annot["area"] / area) >= mask_area_frac_thresh
+ ):
+ continue
+ rle_masks.append(frame_annot["segmentation"])
+
+ self.segments = LazySegments()
+ for i, rle in enumerate(rle_masks):
+ self.segments[i] = rle
+
+ def load(self, frame_idx):
+ return self.segments
diff --git a/phantom/submodules/sam2/training/loss_fns.py b/phantom/submodules/sam2/training/loss_fns.py
new file mode 100644
index 0000000000000000000000000000000000000000..d281b1a9c059771ee0ae3a4d4426f1e445178110
--- /dev/null
+++ b/phantom/submodules/sam2/training/loss_fns.py
@@ -0,0 +1,307 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import defaultdict
+from typing import Dict, List
+
+import torch
+import torch.distributed
+import torch.nn as nn
+import torch.nn.functional as F
+
+from training.trainer import CORE_LOSS_KEY
+
+from training.utils.distributed import get_world_size, is_dist_avail_and_initialized
+
+
+def dice_loss(inputs, targets, num_objects, loss_on_multimask=False):
+ """
+ Compute the DICE loss, similar to generalized IOU for masks
+ Args:
+ inputs: A float tensor of arbitrary shape.
+ The predictions for each example.
+ targets: A float tensor with the same shape as inputs. Stores the binary
+ classification label for each element in inputs
+ (0 for the negative class and 1 for the positive class).
+ num_objects: Number of objects in the batch
+ loss_on_multimask: True if multimask prediction is enabled
+ Returns:
+ Dice loss tensor
+ """
+ inputs = inputs.sigmoid()
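+    # Per-mask dice loss: 1 - (2 * sum(P*T) + 1) / (sum(P) + sum(T) + 1), with predicted
+    # probabilities P and binary targets T; the +1 terms smooth the ratio for empty masks.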
+ if loss_on_multimask:
+ # inputs and targets are [N, M, H, W] where M corresponds to multiple predicted masks
+ assert inputs.dim() == 4 and targets.dim() == 4
+ # flatten spatial dimension while keeping multimask channel dimension
+ inputs = inputs.flatten(2)
+ targets = targets.flatten(2)
+ numerator = 2 * (inputs * targets).sum(-1)
+ else:
+ inputs = inputs.flatten(1)
+ numerator = 2 * (inputs * targets).sum(1)
+ denominator = inputs.sum(-1) + targets.sum(-1)
+ loss = 1 - (numerator + 1) / (denominator + 1)
+ if loss_on_multimask:
+ return loss / num_objects
+ return loss.sum() / num_objects
+
+
+def sigmoid_focal_loss(
+ inputs,
+ targets,
+ num_objects,
+ alpha: float = 0.25,
+ gamma: float = 2,
+ loss_on_multimask=False,
+):
+ """
+ Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+ Args:
+ inputs: A float tensor of arbitrary shape.
+ The predictions for each example.
+ targets: A float tensor with the same shape as inputs. Stores the binary
+ classification label for each element in inputs
+ (0 for the negative class and 1 for the positive class).
+ num_objects: Number of objects in the batch
+ alpha: (optional) Weighting factor in range (0,1) to balance
+                positive vs negative examples. Default = 0.25 (set to a negative value to disable weighting).
+ gamma: Exponent of the modulating factor (1 - p_t) to
+ balance easy vs hard examples.
+ loss_on_multimask: True if multimask prediction is enabled
+ Returns:
+ focal loss tensor
+ """
+ prob = inputs.sigmoid()
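+    # Per-element focal loss: alpha_t * (1 - p_t)^gamma * BCE(inputs, targets), where p_t is
+    # the probability assigned to the true class and gamma down-weights easy examples.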
+ ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+ p_t = prob * targets + (1 - prob) * (1 - targets)
+ loss = ce_loss * ((1 - p_t) ** gamma)
+
+ if alpha >= 0:
+ alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+ loss = alpha_t * loss
+
+ if loss_on_multimask:
+ # loss is [N, M, H, W] where M corresponds to multiple predicted masks
+ assert loss.dim() == 4
+ return loss.flatten(2).mean(-1) / num_objects # average over spatial dims
+ return loss.mean(1).sum() / num_objects
+
+
+def iou_loss(
+ inputs, targets, pred_ious, num_objects, loss_on_multimask=False, use_l1_loss=False
+):
+ """
+ Args:
+ inputs: A float tensor of arbitrary shape.
+ The predictions for each example.
+ targets: A float tensor with the same shape as inputs. Stores the binary
+ classification label for each element in inputs
+ (0 for the negative class and 1 for the positive class).
+ pred_ious: A float tensor containing the predicted IoUs scores per mask
+ num_objects: Number of objects in the batch
+ loss_on_multimask: True if multimask prediction is enabled
+        use_l1_loss: Whether to use L1 loss instead of MSE loss
+ Returns:
+ IoU loss tensor
+ """
+ assert inputs.dim() == 4 and targets.dim() == 4
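+    # The regression target is the actual IoU between the thresholded (logit > 0) predicted
+    # mask and the ground-truth mask; pred_ious is fit to it with a per-mask L1 or MSE loss.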
+ pred_mask = inputs.flatten(2) > 0
+ gt_mask = targets.flatten(2) > 0
+ area_i = torch.sum(pred_mask & gt_mask, dim=-1).float()
+ area_u = torch.sum(pred_mask | gt_mask, dim=-1).float()
+ actual_ious = area_i / torch.clamp(area_u, min=1.0)
+
+ if use_l1_loss:
+ loss = F.l1_loss(pred_ious, actual_ious, reduction="none")
+ else:
+ loss = F.mse_loss(pred_ious, actual_ious, reduction="none")
+ if loss_on_multimask:
+ return loss / num_objects
+ return loss.sum() / num_objects
+
+
+class MultiStepMultiMasksAndIous(nn.Module):
+ def __init__(
+ self,
+ weight_dict,
+ focal_alpha=0.25,
+ focal_gamma=2,
+ supervise_all_iou=False,
+ iou_use_l1_loss=False,
+ pred_obj_scores=False,
+ focal_gamma_obj_score=0.0,
+ focal_alpha_obj_score=-1,
+ ):
+ """
+ This class computes the multi-step multi-mask and IoU losses.
+ Args:
+ weight_dict: dict containing weights for focal, dice, iou losses
+ focal_alpha: alpha for sigmoid focal loss
+ focal_gamma: gamma for sigmoid focal loss
+ supervise_all_iou: if True, back-prop iou losses for all predicted masks
+ iou_use_l1_loss: use L1 loss instead of MSE loss for iou
+ pred_obj_scores: if True, compute loss for object scores
+ focal_gamma_obj_score: gamma for sigmoid focal loss on object scores
+ focal_alpha_obj_score: alpha for sigmoid focal loss on object scores
+ """
+
+ super().__init__()
+ self.weight_dict = weight_dict
+ self.focal_alpha = focal_alpha
+ self.focal_gamma = focal_gamma
+ assert "loss_mask" in self.weight_dict
+ assert "loss_dice" in self.weight_dict
+ assert "loss_iou" in self.weight_dict
+ if "loss_class" not in self.weight_dict:
+ self.weight_dict["loss_class"] = 0.0
+
+ self.focal_alpha_obj_score = focal_alpha_obj_score
+ self.focal_gamma_obj_score = focal_gamma_obj_score
+ self.supervise_all_iou = supervise_all_iou
+ self.iou_use_l1_loss = iou_use_l1_loss
+ self.pred_obj_scores = pred_obj_scores
+
+ def forward(self, outs_batch: List[Dict], targets_batch: torch.Tensor):
+ assert len(outs_batch) == len(targets_batch)
+ num_objects = torch.tensor(
+ (targets_batch.shape[1]), device=targets_batch.device, dtype=torch.float
+ ) # Number of objects is fixed within a batch
+ if is_dist_avail_and_initialized():
+ torch.distributed.all_reduce(num_objects)
+ num_objects = torch.clamp(num_objects / get_world_size(), min=1).item()
+
+ losses = defaultdict(int)
+ for outs, targets in zip(outs_batch, targets_batch):
+ cur_losses = self._forward(outs, targets, num_objects)
+ for k, v in cur_losses.items():
+ losses[k] += v
+
+ return losses
+
+ def _forward(self, outputs: Dict, targets: torch.Tensor, num_objects):
+ """
+        Compute the losses related to the masks (the focal loss and the dice loss),
+        as well as the L1 or MSE loss between predicted IoUs and actual IoUs.
+
+        Here "multistep_pred_multimasks_high_res" is a list of multimasks (tensors
+        of shape [N, M, H, W], where M could be 1 or larger), corresponding to
+        one or multiple predicted masks from a click.
+
+        We back-propagate the focal and dice losses only on the prediction channel
+        with the lowest focal+dice loss between the predicted mask and the ground truth.
+        If `supervise_all_iou` is True, we back-propagate IoU losses for all predicted masks.
+ """
+
+ target_masks = targets.unsqueeze(1).float()
+ assert target_masks.dim() == 4 # [N, 1, H, W]
+ src_masks_list = outputs["multistep_pred_multimasks_high_res"]
+ ious_list = outputs["multistep_pred_ious"]
+ object_score_logits_list = outputs["multistep_object_score_logits"]
+
+ assert len(src_masks_list) == len(ious_list)
+ assert len(object_score_logits_list) == len(ious_list)
+
+ # accumulate the loss over prediction steps
+ losses = {"loss_mask": 0, "loss_dice": 0, "loss_iou": 0, "loss_class": 0}
+ for src_masks, ious, object_score_logits in zip(
+ src_masks_list, ious_list, object_score_logits_list
+ ):
+ self._update_losses(
+ losses, src_masks, target_masks, ious, num_objects, object_score_logits
+ )
+ losses[CORE_LOSS_KEY] = self.reduce_loss(losses)
+ return losses
+
+ def _update_losses(
+ self, losses, src_masks, target_masks, ious, num_objects, object_score_logits
+ ):
+ target_masks = target_masks.expand_as(src_masks)
+ # get focal, dice and iou loss on all output masks in a prediction step
+ loss_multimask = sigmoid_focal_loss(
+ src_masks,
+ target_masks,
+ num_objects,
+ alpha=self.focal_alpha,
+ gamma=self.focal_gamma,
+ loss_on_multimask=True,
+ )
+ loss_multidice = dice_loss(
+ src_masks, target_masks, num_objects, loss_on_multimask=True
+ )
+ if not self.pred_obj_scores:
+ loss_class = torch.tensor(
+ 0.0, dtype=loss_multimask.dtype, device=loss_multimask.device
+ )
+ target_obj = torch.ones(
+ loss_multimask.shape[0],
+ 1,
+ dtype=loss_multimask.dtype,
+ device=loss_multimask.device,
+ )
+ else:
+ target_obj = torch.any((target_masks[:, 0] > 0).flatten(1), dim=-1)[
+ ..., None
+ ].float()
+ loss_class = sigmoid_focal_loss(
+ object_score_logits,
+ target_obj,
+ num_objects,
+ alpha=self.focal_alpha_obj_score,
+ gamma=self.focal_gamma_obj_score,
+ )
+
+ loss_multiiou = iou_loss(
+ src_masks,
+ target_masks,
+ ious,
+ num_objects,
+ loss_on_multimask=True,
+ use_l1_loss=self.iou_use_l1_loss,
+ )
+ assert loss_multimask.dim() == 2
+ assert loss_multidice.dim() == 2
+ assert loss_multiiou.dim() == 2
+ if loss_multimask.size(1) > 1:
+ # take the mask indices with the smallest focal + dice loss for back propagation
+ loss_combo = (
+ loss_multimask * self.weight_dict["loss_mask"]
+ + loss_multidice * self.weight_dict["loss_dice"]
+ )
+ best_loss_inds = torch.argmin(loss_combo, dim=-1)
+ batch_inds = torch.arange(loss_combo.size(0), device=loss_combo.device)
+ loss_mask = loss_multimask[batch_inds, best_loss_inds].unsqueeze(1)
+ loss_dice = loss_multidice[batch_inds, best_loss_inds].unsqueeze(1)
+ # calculate the iou prediction and slot losses only in the index
+ # with the minimum loss for each mask (to be consistent w/ SAM)
+ if self.supervise_all_iou:
+ loss_iou = loss_multiiou.mean(dim=-1).unsqueeze(1)
+ else:
+ loss_iou = loss_multiiou[batch_inds, best_loss_inds].unsqueeze(1)
+ else:
+ loss_mask = loss_multimask
+ loss_dice = loss_multidice
+ loss_iou = loss_multiiou
+
+ # backprop focal, dice and iou loss only if obj present
+ loss_mask = loss_mask * target_obj
+ loss_dice = loss_dice * target_obj
+ loss_iou = loss_iou * target_obj
+
+ # sum over batch dimension (note that the losses are already divided by num_objects)
+ losses["loss_mask"] += loss_mask.sum()
+ losses["loss_dice"] += loss_dice.sum()
+ losses["loss_iou"] += loss_iou.sum()
+ losses["loss_class"] += loss_class
+
+ def reduce_loss(self, losses):
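+        # The reduced (CORE_LOSS_KEY) loss is the weighted sum of the individual loss terms,
+        # using the weights in `weight_dict`.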
+ reduced_loss = 0.0
+ for loss_key, weight in self.weight_dict.items():
+ if loss_key not in losses:
+ raise ValueError(f"{type(self)} doesn't compute {loss_key}")
+ if weight != 0:
+ reduced_loss += losses[loss_key] * weight
+
+ return reduced_loss
diff --git a/phantom/submodules/sam2/training/model/__init__.py b/phantom/submodules/sam2/training/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/phantom/submodules/sam2/training/model/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/training/model/sam2.py b/phantom/submodules/sam2/training/model/sam2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef7567c4dc99942d48e5890529ba9e3ca265e02d
--- /dev/null
+++ b/phantom/submodules/sam2/training/model/sam2.py
@@ -0,0 +1,541 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+import torch
+import torch.distributed
+from sam2.modeling.sam2_base import SAM2Base
+from sam2.modeling.sam2_utils import (
+ get_1d_sine_pe,
+ get_next_point,
+ sample_box_points,
+ select_closest_cond_frames,
+)
+
+from sam2.utils.misc import concat_points
+
+from training.utils.data_utils import BatchedVideoDatapoint
+
+
+class SAM2Train(SAM2Base):
+ def __init__(
+ self,
+ image_encoder,
+ memory_attention=None,
+ memory_encoder=None,
+ prob_to_use_pt_input_for_train=0.0,
+ prob_to_use_pt_input_for_eval=0.0,
+ prob_to_use_box_input_for_train=0.0,
+ prob_to_use_box_input_for_eval=0.0,
+        # if it is greater than 1, we do interactive point sampling on the 1st frame and other randomly selected frames
+ num_frames_to_correct_for_train=1, # default: only iteratively sample on first frame
+ num_frames_to_correct_for_eval=1, # default: only iteratively sample on first frame
+ rand_frames_to_correct_for_train=False,
+ rand_frames_to_correct_for_eval=False,
+ # how many frames to use as initial conditioning frames (for both point input and mask input; the first frame is always used as an initial conditioning frame)
+ # - if `rand_init_cond_frames` below is True, we randomly sample 1~num_init_cond_frames initial conditioning frames
+ # - otherwise we sample a fixed number of num_init_cond_frames initial conditioning frames
+ # note: for point input, we sample correction points on all such initial conditioning frames, and we require that `num_frames_to_correct` >= `num_init_cond_frames`;
+ # these are initial conditioning frames because as we track the video, more conditioning frames might be added
+ # when a frame receives correction clicks under point input if `add_all_frames_to_correct_as_cond=True`
+ num_init_cond_frames_for_train=1, # default: only use the first frame as initial conditioning frame
+ num_init_cond_frames_for_eval=1, # default: only use the first frame as initial conditioning frame
+        rand_init_cond_frames_for_train=True,  # default: random 1~num_init_cond_frames_for_train cond frames (to be consistent w/ previous TA data loader)
+ rand_init_cond_frames_for_eval=False,
+ # if `add_all_frames_to_correct_as_cond` is True, we also append to the conditioning frame list any frame that receives a later correction click
+        # if `add_all_frames_to_correct_as_cond` is False, we restrict the conditioning frame list to only those initial conditioning frames
+ add_all_frames_to_correct_as_cond=False,
+ # how many additional correction points to sample (on each frame selected to be corrected)
+ # note that the first frame receives an initial input click (in addition to any correction clicks)
+ num_correction_pt_per_frame=7,
+ # method for point sampling during evaluation
+ # "uniform" (sample uniformly from error region) or "center" (use the point with the largest distance to error region boundary)
+ # default to "center" to be consistent with evaluation in the SAM paper
+ pt_sampling_for_eval="center",
+ # During training, we optionally allow sampling the correction points from GT regions
+ # instead of the prediction error regions with a small probability. This might allow the
+ # model to overfit less to the error regions in training datasets
+ prob_to_sample_from_gt_for_train=0.0,
+ use_act_ckpt_iterative_pt_sampling=False,
+ # whether to forward image features per frame (as it's being tracked) during evaluation, instead of forwarding image features
+ # of all frames at once. This avoids backbone OOM errors on very long videos in evaluation, but could be slightly slower.
+ forward_backbone_per_frame_for_eval=False,
+ freeze_image_encoder=False,
+ **kwargs,
+ ):
+ super().__init__(image_encoder, memory_attention, memory_encoder, **kwargs)
+ self.use_act_ckpt_iterative_pt_sampling = use_act_ckpt_iterative_pt_sampling
+ self.forward_backbone_per_frame_for_eval = forward_backbone_per_frame_for_eval
+
+ # Point sampler and conditioning frames
+ self.prob_to_use_pt_input_for_train = prob_to_use_pt_input_for_train
+ self.prob_to_use_box_input_for_train = prob_to_use_box_input_for_train
+ self.prob_to_use_pt_input_for_eval = prob_to_use_pt_input_for_eval
+ self.prob_to_use_box_input_for_eval = prob_to_use_box_input_for_eval
+ if prob_to_use_pt_input_for_train > 0 or prob_to_use_pt_input_for_eval > 0:
+ logging.info(
+ f"Training with points (sampled from masks) as inputs with p={prob_to_use_pt_input_for_train}"
+ )
+ assert num_frames_to_correct_for_train >= num_init_cond_frames_for_train
+ assert num_frames_to_correct_for_eval >= num_init_cond_frames_for_eval
+
+ self.num_frames_to_correct_for_train = num_frames_to_correct_for_train
+ self.num_frames_to_correct_for_eval = num_frames_to_correct_for_eval
+ self.rand_frames_to_correct_for_train = rand_frames_to_correct_for_train
+ self.rand_frames_to_correct_for_eval = rand_frames_to_correct_for_eval
+ # Initial multi-conditioning frames
+ self.num_init_cond_frames_for_train = num_init_cond_frames_for_train
+ self.num_init_cond_frames_for_eval = num_init_cond_frames_for_eval
+ self.rand_init_cond_frames_for_train = rand_init_cond_frames_for_train
+ self.rand_init_cond_frames_for_eval = rand_init_cond_frames_for_eval
+ self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
+ self.num_correction_pt_per_frame = num_correction_pt_per_frame
+ self.pt_sampling_for_eval = pt_sampling_for_eval
+ self.prob_to_sample_from_gt_for_train = prob_to_sample_from_gt_for_train
+ # A random number generator with a fixed initial seed across GPUs
+ self.rng = np.random.default_rng(seed=42)
+
+ if freeze_image_encoder:
+ for p in self.image_encoder.parameters():
+ p.requires_grad = False
+
+ def forward(self, input: BatchedVideoDatapoint):
+ if self.training or not self.forward_backbone_per_frame_for_eval:
+ # precompute image features on all frames before tracking
+ backbone_out = self.forward_image(input.flat_img_batch)
+ else:
+ # defer image feature computation on a frame until it's being tracked
+ backbone_out = {"backbone_fpn": None, "vision_pos_enc": None}
+ backbone_out = self.prepare_prompt_inputs(backbone_out, input)
+ previous_stages_out = self.forward_tracking(backbone_out, input)
+
+ return previous_stages_out
+
+ def _prepare_backbone_features_per_frame(self, img_batch, img_ids):
+ """Compute the image backbone features on the fly for the given img_ids."""
+ # Only forward backbone on unique image ids to avoid repetitive computation
+ # (if `img_ids` has only one element, it's already unique so we skip this step).
+ if img_ids.numel() > 1:
+ unique_img_ids, inv_ids = torch.unique(img_ids, return_inverse=True)
+ else:
+ unique_img_ids, inv_ids = img_ids, None
+
+ # Compute the image features on those unique image ids
+ image = img_batch[unique_img_ids]
+ backbone_out = self.forward_image(image)
+ (
+ _,
+ vision_feats,
+ vision_pos_embeds,
+ feat_sizes,
+ ) = self._prepare_backbone_features(backbone_out)
+ # Inverse-map image features for `unique_img_ids` to the final image features
+ # for the original input `img_ids`.
+ if inv_ids is not None:
+ image = image[inv_ids]
+ vision_feats = [x[:, inv_ids] for x in vision_feats]
+ vision_pos_embeds = [x[:, inv_ids] for x in vision_pos_embeds]
+
+ return image, vision_feats, vision_pos_embeds, feat_sizes
+
+ def prepare_prompt_inputs(self, backbone_out, input, start_frame_idx=0):
+ """
+ Prepare input mask, point or box prompts. Optionally, we allow tracking from
+ a custom `start_frame_idx` to the end of the video (for evaluation purposes).
+ """
+ # Load the ground-truth masks on all frames (so that we can later
+ # sample correction points from them)
+ # gt_masks_per_frame = {
+ # stage_id: targets.segments.unsqueeze(1) # [B, 1, H_im, W_im]
+ # for stage_id, targets in enumerate(input.find_targets)
+ # }
+ gt_masks_per_frame = {
+ stage_id: masks.unsqueeze(1) # [B, 1, H_im, W_im]
+ for stage_id, masks in enumerate(input.masks)
+ }
+ # gt_masks_per_frame = input.masks.unsqueeze(2) # [T,B,1,H_im,W_im] keep everything in tensor form
+ backbone_out["gt_masks_per_frame"] = gt_masks_per_frame
+ num_frames = input.num_frames
+ backbone_out["num_frames"] = num_frames
+
+ # Randomly decide whether to use point inputs or mask inputs
+ if self.training:
+ prob_to_use_pt_input = self.prob_to_use_pt_input_for_train
+ prob_to_use_box_input = self.prob_to_use_box_input_for_train
+ num_frames_to_correct = self.num_frames_to_correct_for_train
+ rand_frames_to_correct = self.rand_frames_to_correct_for_train
+ num_init_cond_frames = self.num_init_cond_frames_for_train
+ rand_init_cond_frames = self.rand_init_cond_frames_for_train
+ else:
+ prob_to_use_pt_input = self.prob_to_use_pt_input_for_eval
+ prob_to_use_box_input = self.prob_to_use_box_input_for_eval
+ num_frames_to_correct = self.num_frames_to_correct_for_eval
+ rand_frames_to_correct = self.rand_frames_to_correct_for_eval
+ num_init_cond_frames = self.num_init_cond_frames_for_eval
+ rand_init_cond_frames = self.rand_init_cond_frames_for_eval
+ if num_frames == 1:
+ # here we handle a special case for mixing video + SAM on image training,
+ # where we force using point input for the SAM task on static images
+ prob_to_use_pt_input = 1.0
+ num_frames_to_correct = 1
+ num_init_cond_frames = 1
+ assert num_init_cond_frames >= 1
+        # (here `self.rng.random()` returns a value in the range 0.0 <= X < 1.0)
+ use_pt_input = self.rng.random() < prob_to_use_pt_input
+ if rand_init_cond_frames and num_init_cond_frames > 1:
+ # randomly select 1 to `num_init_cond_frames` frames as initial conditioning frames
+ num_init_cond_frames = self.rng.integers(
+ 1, num_init_cond_frames, endpoint=True
+ )
+ if (
+ use_pt_input
+ and rand_frames_to_correct
+ and num_frames_to_correct > num_init_cond_frames
+ ):
+ # randomly select `num_init_cond_frames` to `num_frames_to_correct` frames to sample
+ # correction clicks (only for the case of point input)
+ num_frames_to_correct = self.rng.integers(
+ num_init_cond_frames, num_frames_to_correct, endpoint=True
+ )
+ backbone_out["use_pt_input"] = use_pt_input
+
+ # Sample initial conditioning frames
+ if num_init_cond_frames == 1:
+ init_cond_frames = [start_frame_idx] # starting frame
+ else:
+ # starting frame + randomly selected remaining frames (without replacement)
+ init_cond_frames = [start_frame_idx] + self.rng.choice(
+ range(start_frame_idx + 1, num_frames),
+ num_init_cond_frames - 1,
+ replace=False,
+ ).tolist()
+ backbone_out["init_cond_frames"] = init_cond_frames
+ backbone_out["frames_not_in_init_cond"] = [
+ t for t in range(start_frame_idx, num_frames) if t not in init_cond_frames
+ ]
+ # Prepare mask or point inputs on initial conditioning frames
+        backbone_out["mask_inputs_per_frame"] = {}  # {frame_idx: <input_masks>}
+        backbone_out["point_inputs_per_frame"] = {}  # {frame_idx: <input_points>}
+ for t in init_cond_frames:
+ if not use_pt_input:
+ backbone_out["mask_inputs_per_frame"][t] = gt_masks_per_frame[t]
+ else:
+                # During training, P(box) = prob_to_use_pt_input * prob_to_use_box_input
+ use_box_input = self.rng.random() < prob_to_use_box_input
+ if use_box_input:
+ points, labels = sample_box_points(
+ gt_masks_per_frame[t],
+ )
+ else:
+ # (here we only sample **one initial point** on initial conditioning frames from the
+ # ground-truth mask; we may sample more correction points on the fly)
+ points, labels = get_next_point(
+ gt_masks=gt_masks_per_frame[t],
+ pred_masks=None,
+ method=(
+ "uniform" if self.training else self.pt_sampling_for_eval
+ ),
+ )
+
+ point_inputs = {"point_coords": points, "point_labels": labels}
+ backbone_out["point_inputs_per_frame"][t] = point_inputs
+
+ # Sample frames where we will add correction clicks on the fly
+ # based on the error between prediction and ground-truth masks
+ if not use_pt_input:
+ # no correction points will be sampled when using mask inputs
+ frames_to_add_correction_pt = []
+ elif num_frames_to_correct == num_init_cond_frames:
+ frames_to_add_correction_pt = init_cond_frames
+ else:
+ assert num_frames_to_correct > num_init_cond_frames
+ # initial cond frame + randomly selected remaining frames (without replacement)
+ extra_num = num_frames_to_correct - num_init_cond_frames
+ frames_to_add_correction_pt = (
+ init_cond_frames
+ + self.rng.choice(
+ backbone_out["frames_not_in_init_cond"], extra_num, replace=False
+ ).tolist()
+ )
+ backbone_out["frames_to_add_correction_pt"] = frames_to_add_correction_pt
+
+ return backbone_out
+
+ def forward_tracking(
+ self, backbone_out, input: BatchedVideoDatapoint, return_dict=False
+ ):
+ """Forward video tracking on each frame (and sample correction clicks)."""
+ img_feats_already_computed = backbone_out["backbone_fpn"] is not None
+ if img_feats_already_computed:
+ # Prepare the backbone features
+ # - vision_feats and vision_pos_embeds are in (HW)BC format
+ (
+ _,
+ vision_feats,
+ vision_pos_embeds,
+ feat_sizes,
+ ) = self._prepare_backbone_features(backbone_out)
+
+ # Starting the stage loop
+ num_frames = backbone_out["num_frames"]
+ init_cond_frames = backbone_out["init_cond_frames"]
+ frames_to_add_correction_pt = backbone_out["frames_to_add_correction_pt"]
+ # first process all the initial conditioning frames to encode them as memory,
+        # and then condition on them to track the remaining frames
+ processing_order = init_cond_frames + backbone_out["frames_not_in_init_cond"]
+        output_dict = {
+            "cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+            "non_cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+        }
+ for stage_id in processing_order:
+ # Get the image features for the current frames
+ # img_ids = input.find_inputs[stage_id].img_ids
+ img_ids = input.flat_obj_to_img_idx[stage_id]
+ if img_feats_already_computed:
+ # Retrieve image features according to img_ids (if they are already computed).
+ current_vision_feats = [x[:, img_ids] for x in vision_feats]
+ current_vision_pos_embeds = [x[:, img_ids] for x in vision_pos_embeds]
+ else:
+ # Otherwise, compute the image features on the fly for the given img_ids
+ # (this might be used for evaluation on long videos to avoid backbone OOM).
+ (
+ _,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ ) = self._prepare_backbone_features_per_frame(
+ input.flat_img_batch, img_ids
+ )
+
+ # Get output masks based on this frame's prompts and previous memory
+ current_out = self.track_step(
+ frame_idx=stage_id,
+ is_init_cond_frame=stage_id in init_cond_frames,
+ current_vision_feats=current_vision_feats,
+ current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes,
+ point_inputs=backbone_out["point_inputs_per_frame"].get(stage_id, None),
+ mask_inputs=backbone_out["mask_inputs_per_frame"].get(stage_id, None),
+ gt_masks=backbone_out["gt_masks_per_frame"].get(stage_id, None),
+ frames_to_add_correction_pt=frames_to_add_correction_pt,
+ output_dict=output_dict,
+ num_frames=num_frames,
+ )
+ # Append the output, depending on whether it's a conditioning frame
+ add_output_as_cond_frame = stage_id in init_cond_frames or (
+ self.add_all_frames_to_correct_as_cond
+ and stage_id in frames_to_add_correction_pt
+ )
+ if add_output_as_cond_frame:
+ output_dict["cond_frame_outputs"][stage_id] = current_out
+ else:
+ output_dict["non_cond_frame_outputs"][stage_id] = current_out
+
+ if return_dict:
+ return output_dict
+ # turn `output_dict` into a list for loss function
+ all_frame_outputs = {}
+ all_frame_outputs.update(output_dict["cond_frame_outputs"])
+ all_frame_outputs.update(output_dict["non_cond_frame_outputs"])
+ all_frame_outputs = [all_frame_outputs[t] for t in range(num_frames)]
+ # Make DDP happy with activation checkpointing by removing unused keys
+ all_frame_outputs = [
+ {k: v for k, v in d.items() if k != "obj_ptr"} for d in all_frame_outputs
+ ]
+
+ return all_frame_outputs
+
+ def track_step(
+ self,
+ frame_idx,
+ is_init_cond_frame,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ point_inputs,
+ mask_inputs,
+ output_dict,
+ num_frames,
+ track_in_reverse=False, # tracking in reverse time order (for demo usage)
+ run_mem_encoder=True, # Whether to run the memory encoder on the predicted masks.
+ prev_sam_mask_logits=None, # The previously predicted SAM mask logits.
+ frames_to_add_correction_pt=None,
+ gt_masks=None,
+ ):
+ if frames_to_add_correction_pt is None:
+ frames_to_add_correction_pt = []
+ current_out, sam_outputs, high_res_features, pix_feat = self._track_step(
+ frame_idx,
+ is_init_cond_frame,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ point_inputs,
+ mask_inputs,
+ output_dict,
+ num_frames,
+ track_in_reverse,
+ prev_sam_mask_logits,
+ )
+
+ (
+ low_res_multimasks,
+ high_res_multimasks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ ) = sam_outputs
+
+ current_out["multistep_pred_masks"] = low_res_masks
+ current_out["multistep_pred_masks_high_res"] = high_res_masks
+ current_out["multistep_pred_multimasks"] = [low_res_multimasks]
+ current_out["multistep_pred_multimasks_high_res"] = [high_res_multimasks]
+ current_out["multistep_pred_ious"] = [ious]
+ current_out["multistep_point_inputs"] = [point_inputs]
+ current_out["multistep_object_score_logits"] = [object_score_logits]
+
+ # Optionally, sample correction points iteratively to correct the mask
+ if frame_idx in frames_to_add_correction_pt:
+ point_inputs, final_sam_outputs = self._iter_correct_pt_sampling(
+ is_init_cond_frame,
+ point_inputs,
+ gt_masks,
+ high_res_features,
+ pix_feat,
+ low_res_multimasks,
+ high_res_multimasks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ object_score_logits,
+ current_out,
+ )
+ (
+ _,
+ _,
+ _,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ ) = final_sam_outputs
+
+ # Use the final prediction (after all correction steps) for output and eval
+ current_out["pred_masks"] = low_res_masks
+ current_out["pred_masks_high_res"] = high_res_masks
+ current_out["obj_ptr"] = obj_ptr
+
+ # Finally run the memory encoder on the predicted mask to encode
+ # it into a new memory feature (that can be used in future frames)
+ self._encode_memory_in_output(
+ current_vision_feats,
+ feat_sizes,
+ point_inputs,
+ run_mem_encoder,
+ high_res_masks,
+ object_score_logits,
+ current_out,
+ )
+ return current_out
+
+ def _iter_correct_pt_sampling(
+ self,
+ is_init_cond_frame,
+ point_inputs,
+ gt_masks,
+ high_res_features,
+ pix_feat_with_mem,
+ low_res_multimasks,
+ high_res_multimasks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ object_score_logits,
+ current_out,
+ ):
+
+ assert gt_masks is not None
+ all_pred_masks = [low_res_masks]
+ all_pred_high_res_masks = [high_res_masks]
+ all_pred_multimasks = [low_res_multimasks]
+ all_pred_high_res_multimasks = [high_res_multimasks]
+ all_pred_ious = [ious]
+ all_point_inputs = [point_inputs]
+ all_object_score_logits = [object_score_logits]
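+ # Iterative correction loop: each pass samples one extra click from the
+ # disagreement between the current prediction and the ground-truth mask,
+ # feeds the accumulated clicks (plus the previous low-res mask logits) back
+ # through the SAM heads, and records every intermediate prediction so that
+ # all steps can be supervised by the loss.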
+ for _ in range(self.num_correction_pt_per_frame):
+ # sample a new point from the error between prediction and ground-truth
+ # (with a small probability, directly sample from GT masks instead of errors)
+ if self.training and self.prob_to_sample_from_gt_for_train > 0:
+ sample_from_gt = (
+ self.rng.random() < self.prob_to_sample_from_gt_for_train
+ )
+ else:
+ sample_from_gt = False
+ # if `pred_for_new_pt` is None, only GT masks will be used for point sampling
+ pred_for_new_pt = None if sample_from_gt else (high_res_masks > 0)
+ new_points, new_labels = get_next_point(
+ gt_masks=gt_masks,
+ pred_masks=pred_for_new_pt,
+ method="uniform" if self.training else self.pt_sampling_for_eval,
+ )
+ point_inputs = concat_points(point_inputs, new_points, new_labels)
+ # Feed the mask logits of the previous SAM outputs in the next SAM decoder step.
+ # For tracking, this means that when the user adds a correction click, we also feed
+ # the tracking output mask logits along with the click as input to the SAM decoder.
+ mask_inputs = low_res_masks
+ multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
+ if self.use_act_ckpt_iterative_pt_sampling and not multimask_output:
+ sam_outputs = torch.utils.checkpoint.checkpoint(
+ self._forward_sam_heads,
+ backbone_features=pix_feat_with_mem,
+ point_inputs=point_inputs,
+ mask_inputs=mask_inputs,
+ high_res_features=high_res_features,
+ multimask_output=multimask_output,
+ use_reentrant=False,
+ )
+ else:
+ sam_outputs = self._forward_sam_heads(
+ backbone_features=pix_feat_with_mem,
+ point_inputs=point_inputs,
+ mask_inputs=mask_inputs,
+ high_res_features=high_res_features,
+ multimask_output=multimask_output,
+ )
+ (
+ low_res_multimasks,
+ high_res_multimasks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ _,
+ object_score_logits,
+ ) = sam_outputs
+ all_pred_masks.append(low_res_masks)
+ all_pred_high_res_masks.append(high_res_masks)
+ all_pred_multimasks.append(low_res_multimasks)
+ all_pred_high_res_multimasks.append(high_res_multimasks)
+ all_pred_ious.append(ious)
+ all_point_inputs.append(point_inputs)
+ all_object_score_logits.append(object_score_logits)
+
+ # Concatenate the masks along channel (to compute losses on all of them,
+ # using `MultiStepIteractiveMasks`)
+ current_out["multistep_pred_masks"] = torch.cat(all_pred_masks, dim=1)
+ current_out["multistep_pred_masks_high_res"] = torch.cat(
+ all_pred_high_res_masks, dim=1
+ )
+ current_out["multistep_pred_multimasks"] = all_pred_multimasks
+ current_out["multistep_pred_multimasks_high_res"] = all_pred_high_res_multimasks
+ current_out["multistep_pred_ious"] = all_pred_ious
+ current_out["multistep_point_inputs"] = all_point_inputs
+ current_out["multistep_object_score_logits"] = all_object_score_logits
+
+ return point_inputs, sam_outputs
diff --git a/phantom/submodules/sam2/training/optimizer.py b/phantom/submodules/sam2/training/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae159663f6efc2dac4f5ffa3b1c91b97a78dec76
--- /dev/null
+++ b/phantom/submodules/sam2/training/optimizer.py
@@ -0,0 +1,502 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import fnmatch
+import inspect
+import itertools
+import logging
+import types
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ List,
+ Mapping,
+ Optional,
+ Set,
+ Tuple,
+ Type,
+ Union,
+)
+
+import hydra
+
+import torch
+import torch.nn as nn
+from omegaconf import DictConfig
+from torch import Tensor
+
+
+class Optimizer:
+ def __init__(self, optimizer, schedulers=None) -> None:
+ self.optimizer = optimizer
+ self.schedulers = schedulers
+ self._validate_optimizer_schedulers()
+ self.step_schedulers(0.0, 0)
+
+ def _validate_optimizer_schedulers(self):
+ if self.schedulers is None:
+ return
+ for _, set_of_schedulers in enumerate(self.schedulers):
+ for option, _ in set_of_schedulers.items():
+ assert option in self.optimizer.defaults, (
+ "Optimizer option "
+ f"{option} not found in {self.optimizer}. Valid options are "
+ f"{self.optimizer.defaults.keys()}"
+ )
+
+ def step_schedulers(self, where: float, step: int) -> None:
+ if self.schedulers is None:
+ return
+ for i, param_group in enumerate(self.optimizer.param_groups):
+ for option, scheduler in self.schedulers[i].items():
+ if "step" in inspect.signature(scheduler.__call__).parameters:
+ new_value = scheduler(step=step, where=where)
+ elif (
+ hasattr(scheduler, "scheduler")
+ and "step"
+ in inspect.signature(scheduler.scheduler.__call__).parameters
+ ):
+ # To handle ValueScaler wrappers
+ new_value = scheduler(step=step, where=where)
+ else:
+ new_value = scheduler(where)
+ param_group[option] = new_value
+
+ def step(self, where, step, closure=None):
+ self.step_schedulers(where, step)
+ return self.optimizer.step(closure)
+
+ def zero_grad(self, *args, **kwargs):
+ return self.optimizer.zero_grad(*args, **kwargs)
+
+
+def set_default_parameters(
+ scheduler_cfgs: List[DictConfig], all_parameter_names: Set[str]
+) -> None:
+ """Set up the "default" scheduler with the right parameters.
+
+ Args:
+ scheduler_cfgs: A list of scheduler configs, where each scheduler also
+ specifies which parameters it applies to, based on the names of parameters
+ or the class of the modules. At most one scheduler is allowed to skip this
+ specification, which is used as a "default" specification for any remaining
+ parameters.
+ all_parameter_names: Names of all the parameters to consider.
+ """
+ constraints = [
+ scheduler_cfg.parameter_names
+ for scheduler_cfg in scheduler_cfgs
+ if scheduler_cfg.parameter_names is not None
+ ]
+ if len(constraints) == 0:
+ default_params = set(all_parameter_names)
+ else:
+ default_params = all_parameter_names - set.union(*constraints)
+ default_count = 0
+ for scheduler_cfg in scheduler_cfgs:
+ if scheduler_cfg.parameter_names is None:
+ scheduler_cfg.parameter_names = default_params
+ default_count += 1
+ assert default_count <= 1, "Only one scheduler per option can be default"
+ if default_count == 0:
+ # No default scheduler specified, add a default, but without any scheduler
+ # for that option
+ scheduler_cfgs.append({"parameter_names": default_params})
+
+
+def name_constraints_to_parameters(
+ param_constraints: List[Set[str]], named_parameters: Dict[str, Tensor]
+) -> List[torch.nn.Parameter]:
+ """Return parameters which match the intersection of parameter constraints.
+
+ Note that this returns the parameters themselves, not their names.
+
+ Args:
+ param_constraints: A list, with each element being a set of allowed parameters.
+ named_parameters: Mapping from a parameter name to the parameter itself.
+
+ Returns:
+ A list containing the parameters which overlap with _each_ constraint set from
+ param_constraints.
+ """
+ matching_names = set.intersection(*param_constraints)
+ return [value for name, value in named_parameters.items() if name in matching_names]
+
+
+def map_scheduler_cfgs_to_param_groups(
+ all_scheduler_cfgs: Iterable[List[Dict]],
+ named_parameters: Dict[str, Tensor],
+) -> Tuple[List[Dict[Any, Any]], List[Dict[str, List[torch.nn.Parameter]]]]:
+ """Produce parameter groups corresponding to all the scheduler configs.
+
+ Takes all the scheduler configs, each of which applies to a specific optimizer
+ option (like "lr" or "weight_decay") and has a set of parameter names which it
+ applies to, and produces a final set of param groups where each param group
+ covers all the options which apply to a particular set of parameters.
+
+ Args:
+ all_scheduler_cfgs: All the scheduler configs covering every option.
+ named_parameters: Mapping from a parameter name to the parameter itself.
+ Returns:
+ Tuple of lists of schedulers and param_groups, where schedulers[i]
+ applies to param_groups[i].
+ """
+
+ scheduler_cfgs_per_param_group = itertools.product(*all_scheduler_cfgs)
+ schedulers = []
+ param_groups = []
+ for scheduler_cfgs in scheduler_cfgs_per_param_group:
+ param_constraints = [
+ scheduler_cfg["parameter_names"] for scheduler_cfg in scheduler_cfgs
+ ]
+ matching_parameters = name_constraints_to_parameters(
+ param_constraints, named_parameters
+ )
+ if len(matching_parameters) == 0: # If no overlap of parameters, skip
+ continue
+ schedulers_for_group = {
+ scheduler_cfg["option"]: scheduler_cfg["scheduler"]
+ for scheduler_cfg in scheduler_cfgs
+ if "option" in scheduler_cfg
+ }
+ schedulers.append(schedulers_for_group)
+ param_groups.append({"params": matching_parameters})
+ return schedulers, param_groups
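+ # Illustrative sketch (hypothetical shapes): with two "lr" configs and one
+ # "weight_decay" config, itertools.product yields two candidate groups; each
+ # surviving group holds the parameters in the intersection of its configs'
+ # parameter_names, together with one scheduler per option.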
+
+
+def validate_param_group_params(param_groups: List[Dict], model: nn.Module):
+ """Check that the param groups are non-overlapping and cover all the parameters.
+
+ Args:
+ param_groups: List of all param groups
+ model: Model to validate against. The check ensures that all the model
+ parameters are part of param_groups
+ """
+ for pg in param_groups:
+ # no param should be repeated within a group
+ assert len(pg["params"]) == len(set(pg["params"]))
+ parameters = [set(param_group["params"]) for param_group in param_groups]
+ model_parameters = {parameter for _, parameter in model.named_parameters()}
+ for p1, p2 in itertools.permutations(parameters, 2):
+ assert p1.isdisjoint(p2), "Scheduler generated param_groups should be disjoint"
+ assert set.union(*parameters) == model_parameters, (
+ "Scheduler generated param_groups must include all parameters of the model."
+ f" Found {len(set.union(*parameters))} params whereas model has"
+ f" {len(model_parameters)} params"
+ )
+
+
+def unix_module_cls_pattern_to_parameter_names(
+ filter_module_cls_names: List[str],
+ module_cls_to_param_names: Dict[Type, str],
+) -> Union[None, Set[str]]:
+ """Returns param names which pass the filters specified in filter_module_cls_names.
+
+ Args:
+ filter_module_cls_names: A list of filter strings containing class names, like
+ ["torch.nn.LayerNorm", "torch.nn.BatchNorm2d"]
+ module_cls_to_param_names: Mapping from module classes to the parameter names
+ they contain. See `get_module_cls_to_param_names`.
+ """
+ if filter_module_cls_names is None:
+ return set()
+ allowed_parameter_names = []
+ for module_cls_name in filter_module_cls_names:
+ module_cls = hydra.utils.get_class(module_cls_name)
+ if module_cls not in module_cls_to_param_names:
+ raise AssertionError(
+ f"module_cls_name {module_cls_name} does not "
+ "match any classes in the model"
+ )
+ matching_parameters = module_cls_to_param_names[module_cls]
+ assert (
+ len(matching_parameters) > 0
+ ), f"module_cls_name {module_cls_name} does not contain any parameters in the model"
+ logging.info(
+ f"Matches for module_cls_name [{module_cls_name}]: {matching_parameters} "
+ )
+ allowed_parameter_names.append(matching_parameters)
+ return set.union(*allowed_parameter_names)
+
+
+def unix_param_pattern_to_parameter_names(
+ filter_param_names: Optional[List[str]],
+ parameter_names: Dict[str, torch.Tensor],
+) -> Union[None, Set[str]]:
+ """Returns param names which pass the filters specified in filter_param_names.
+
+ Args:
+ filter_param_names: A list of unix-style filter strings with optional
+ wildcards, like ["block.2.*", "block.2.linear.weight"]
+ parameter_names: Mapping from parameter names to the parameters themselves;
+ only the names are matched against the filter patterns.
+ """
+
+ if filter_param_names is None:
+ return set()
+ allowed_parameter_names = []
+ for param_name in filter_param_names:
+ matching_parameters = set(fnmatch.filter(parameter_names, param_name))
+ assert (
+ len(matching_parameters) >= 1
+ ), f"param_name {param_name} does not match any parameters in the model"
+ logging.info(f"Matches for param_name [{param_name}]: {matching_parameters}")
+ allowed_parameter_names.append(matching_parameters)
+ return set.union(*allowed_parameter_names)
+
+
+def _unix_pattern_to_parameter_names(
+ scheduler_cfg: DictConfig,
+ parameter_names: Set[str],
+ module_cls_to_param_names: Dict[Type, str],
+) -> Union[None, Set[str]]:
+ """Returns param names which pass the filters specified in scheduler_cfg.
+
+ Args:
+ scheduler_cfg: The config for the scheduler
+ parameter_names: The set of all parameter names which will be filtered
+ """
+ if "param_names" not in scheduler_cfg and "module_cls_names" not in scheduler_cfg:
+ return None
+ return unix_param_pattern_to_parameter_names(
+ scheduler_cfg.get("param_names"), parameter_names
+ ).union(
+ unix_module_cls_pattern_to_parameter_names(
+ scheduler_cfg.get("module_cls_names"), module_cls_to_param_names
+ )
+ )
+
+
+def get_module_cls_to_param_names(
+ model: nn.Module, param_allowlist: Set[str] = None
+) -> Dict[Type, str]:
+ """Produce a mapping from all the modules classes to the names of parames they own.
+
+ Only counts a parameter as part of the immediate parent module, i.e. recursive
+ parents do not count.
+
+ Args:
+ model: Model to iterate over
+ param_allowlist: If specified, only these param names will be processed
+ """
+
+ module_cls_to_params = {}
+ for module_name, module in model.named_modules():
+ module_cls = type(module)
+ module_cls_to_params.setdefault(module_cls, set())
+ for param_name, _ in module.named_parameters(recurse=False):
+ full_param_name = get_full_parameter_name(module_name, param_name)
+ if param_allowlist is None or full_param_name in param_allowlist:
+ module_cls_to_params[module_cls].add(full_param_name)
+ return module_cls_to_params
+
+
+def construct_optimizer(
+ model: torch.nn.Module,
+ optimizer_conf: Any,
+ options_conf: Mapping[str, List] = None,
+ param_group_modifiers_conf: List[Callable] = None,
+ param_allowlist: Optional[Set[str]] = None,
+ validate_param_groups=True,
+) -> Optimizer:
+ """
+ Constructs a stochastic gradient descent, ADAM, or ADAMW optimizer
+ with momentum, i.e., constructs a torch.optim.Optimizer with support for zero
+ weight decay on BatchNorm and/or skipping updates of 1-D parameters, based on the config.
+
+ Supports wrapping the optimizer with Layer-wise Adaptive Rate Scaling
+ (LARS): https://arxiv.org/abs/1708.03888
+
+ Args:
+ model: model to perform stochastic gradient descent
+ optimization or ADAM optimization.
+ optimizer_conf: Hydra config consisting of a partial torch optimizer like SGD or
+ ADAM, still missing the params argument which this function provides to
+ produce the final optimizer
+ param_group_modifiers_conf: Optional user specified functions which can modify
+ the final scheduler configs before the optimizer's param groups are built
+ param_allowlist: The parameters to optimize. Parameters which are not part of
+ this allowlist will be skipped.
+ validate_param_groups: If enabled, validates that the produced param_groups don't
+ overlap and cover all the model parameters.
+ """
+ if param_allowlist is None:
+ param_allowlist = {name for name, _ in model.named_parameters()}
+
+ named_parameters = {
+ name: param
+ for name, param in model.named_parameters()
+ if name in param_allowlist
+ }
+
+ if not options_conf:
+ optimizer = hydra.utils.instantiate(optimizer_conf, named_parameters.values())
+ return Optimizer(optimizer)
+
+ all_parameter_names = {
+ name for name, _ in model.named_parameters() if name in param_allowlist
+ }
+ module_cls_to_all_param_names = get_module_cls_to_param_names(
+ model, param_allowlist
+ )
+
+ scheduler_cfgs_per_option = hydra.utils.instantiate(options_conf)
+ all_scheduler_cfgs = []
+ for option, scheduler_cfgs in scheduler_cfgs_per_option.items():
+ for config in scheduler_cfgs:
+ config.option = option
+ config.parameter_names = _unix_pattern_to_parameter_names(
+ config, all_parameter_names, module_cls_to_all_param_names
+ )
+ set_default_parameters(scheduler_cfgs, all_parameter_names)
+ all_scheduler_cfgs.append(scheduler_cfgs)
+
+ if param_group_modifiers_conf:
+ for custom_param_modifier in param_group_modifiers_conf:
+ custom_param_modifier = hydra.utils.instantiate(custom_param_modifier)
+ all_scheduler_cfgs = custom_param_modifier(
+ scheduler_cfgs=all_scheduler_cfgs, model=model
+ )
+ schedulers, param_groups = map_scheduler_cfgs_to_param_groups(
+ all_scheduler_cfgs, named_parameters
+ )
+ if validate_param_groups:
+ validate_param_group_params(param_groups, model)
+ optimizer = hydra.utils.instantiate(optimizer_conf, param_groups)
+ return Optimizer(optimizer, schedulers)
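+ # Illustrative sketch (config values are hypothetical): an `options_conf` with a
+ # default "lr" entry plus a second "lr" entry restricted via param_names: ["*.bias"]
+ # produces two param groups, each driven by its own scheduler through the
+ # Optimizer wrapper returned above.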
+
+
+def get_full_parameter_name(module_name, param_name):
+ if module_name == "":
+ return param_name
+ return f"{module_name}.{param_name}"
+
+
+class GradientClipper:
+ """
+ Gradient clipping utils that works for DDP
+ """
+
+ def __init__(self, max_norm: float = 1.0, norm_type: int = 2):
+ assert isinstance(max_norm, (int, float)) or max_norm is None
+ self.max_norm = max_norm if max_norm is None else float(max_norm)
+ self.norm_type = norm_type
+
+ def __call__(self, model: nn.Module):
+ if self.max_norm is None:
+ return # no-op
+
+ nn.utils.clip_grad_norm_(
+ model.parameters(), max_norm=self.max_norm, norm_type=self.norm_type
+ )
+
+
+class ValueScaler:
+ def __init__(self, scheduler, mult_val: float):
+ self.scheduler = scheduler
+ self.mult_val = mult_val
+
+ def __call__(self, *args, **kwargs):
+ val = self.scheduler(*args, **kwargs)
+ return val * self.mult_val
+
+
+def rgetattr(obj, rattrs: str = None):
+ """
+ Like getattr(), but supports dotted notation for nested objects.
+ rattrs is a str of form 'attr1.attr2', returns obj.attr1.attr2
+ """
+ if rattrs is None:
+ return obj
+ attrs = rattrs.split(".")
+ for attr in attrs:
+ obj = getattr(obj, attr)
+ return obj
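+ # Illustrative sketch (attribute names are hypothetical): rgetattr(model, "image_encoder.trunk")
+ # resolves model.image_encoder.trunk, while rgetattr(model, None) returns model unchanged.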
+
+
+def layer_decay_param_modifier(
+ scheduler_cfgs: List[List[Dict]],
+ model,
+ layer_decay_value: float,
+ layer_decay_min: Optional[float] = None,
+ apply_to: Optional[str] = None,
+ overrides: List[Dict] = (),
+) -> List[List[Dict]]:
+ """
+ Args
+ - scheduler_cfgs: a list of omegaconf.ListConfigs.
+ Each element in the list is an omegaconf.DictConfig with the following structure
+ {
+ "scheduler":
+ "option": possible options are "lr", "weight_decay" etc.
+ "parameter_names": Set of str indicating param names that this scheduler applies to
+ }
+ - model: a model that implements a method `get_layer_id` that maps layer_name to an integer
+ and a method `get_num_layers`.
+ Alternatively, use apply_to argument to select a specific component of the model.
+ - layer_decay_value: float
+ - layer_decay_min: min val for layer decay
+ - apply_to: optional arg to select which component of the model to apply the layer decay modifier to
+ - overrides: to manually override lr for specific patterns. A list of dicts; each dict has keys "pattern" and "value".
+ Returns
+ - scheduler_configs: same structure as the input, elements can be modified
+ """
+ model = rgetattr(model, apply_to)
+ num_layers = model.get_num_layers() + 1
+ layer_decays = [
+ layer_decay_value ** (num_layers - i) for i in range(num_layers + 1)
+ ]
+ if layer_decay_min is not None:
+ layer_decays = [max(val, layer_decay_min) for val in layer_decays]
+ final_scheduler_cfgs = []
+ # scheduler_cfgs is a list of lists
+ for scheduler_cfg_group in scheduler_cfgs:
+ curr_cfg_group = []
+ # scheduler_cfg_group is a list of dictionaries
+ for scheduler_cfg in scheduler_cfg_group:
+ if scheduler_cfg["option"] != "lr":
+ curr_cfg_group.append(scheduler_cfg)
+ continue
+ # Need sorted so that the list of parameter names is deterministic and consistent
+ # across re-runs of this job. Else it was causing issues with loading the optimizer
+ # state during a job restart (D38591759)
+ parameter_names = sorted(scheduler_cfg["parameter_names"])
+
+ # Only want one cfg group per layer
+ layer_cfg_groups = {}
+ for param_name in parameter_names:
+ layer_id = num_layers
+ this_scale = layer_decays[layer_id]
+ if param_name.startswith(apply_to):
+ layer_id = model.get_layer_id(param_name)
+ this_scale = layer_decays[layer_id]
+ # Overrides
+ for override in overrides:
+ if fnmatch.fnmatchcase(param_name, override["pattern"]):
+ this_scale = float(override["value"])
+ layer_id = override["pattern"]
+ break
+
+ if layer_id not in layer_cfg_groups:
+ curr_param = {
+ "option": scheduler_cfg["option"],
+ "scheduler": ValueScaler(
+ scheduler_cfg["scheduler"], this_scale
+ ),
+ "parameter_names": {param_name},
+ }
+ else:
+ curr_param = layer_cfg_groups[layer_id]
+ curr_param["parameter_names"].add(param_name)
+ layer_cfg_groups[layer_id] = curr_param
+
+ for layer_cfg in layer_cfg_groups.values():
+ curr_cfg_group.append(layer_cfg)
+
+ final_scheduler_cfgs.append(curr_cfg_group)
+ return final_scheduler_cfgs
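+ # Illustrative sketch (values are hypothetical): with layer_decay_value=0.75 and
+ # model.get_num_layers() == 2, num_layers is 3 and layer_decays becomes
+ # [0.75**3, 0.75**2, 0.75, 1.0], so earlier layers receive smaller lr scales
+ # and the final layer keeps the unscaled lr.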
diff --git a/phantom/submodules/sam2/training/scripts/sav_frame_extraction_submitit.py b/phantom/submodules/sam2/training/scripts/sav_frame_extraction_submitit.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d5ed2fc77deecf87c8d823bb3fdcf3cb856fc94
--- /dev/null
+++ b/phantom/submodules/sam2/training/scripts/sav_frame_extraction_submitit.py
@@ -0,0 +1,163 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+import argparse
+import os
+from pathlib import Path
+
+import cv2
+
+import numpy as np
+import submitit
+import tqdm
+
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(
+ description="[SA-V Preprocessing] Extracting JPEG frames",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+
+ # ------------
+ # DATA
+ # ------------
+ data_parser = parser.add_argument_group(
+ title="SA-V dataset data root",
+ description="What data to load and how to process it.",
+ )
+ data_parser.add_argument(
+ "--sav-vid-dir",
+ type=str,
+ required=True,
+ help=("Where to find the SAV videos"),
+ )
+ data_parser.add_argument(
+ "--sav-frame-sample-rate",
+ type=int,
+ default=4,
+ help="Rate at which to sub-sample frames",
+ )
+
+ # ------------
+ # LAUNCH
+ # ------------
+ launch_parser = parser.add_argument_group(
+ title="Cluster launch settings",
+ description="Number of jobs and retry settings.",
+ )
+ launch_parser.add_argument(
+ "--n-jobs",
+ type=int,
+ required=True,
+ help="Shard the run over this many jobs.",
+ )
+ launch_parser.add_argument(
+ "--timeout", type=int, required=True, help="SLURM timeout parameter in minutes."
+ )
+ launch_parser.add_argument(
+ "--partition", type=str, required=True, help="Partition to launch on."
+ )
+ launch_parser.add_argument(
+ "--account", type=str, required=True, help="Partition to launch on."
+ )
+ launch_parser.add_argument("--qos", type=str, required=True, help="QOS.")
+
+ # ------------
+ # OUTPUT
+ # ------------
+ output_parser = parser.add_argument_group(
+ title="Setting for results output", description="Where and how to save results."
+ )
+ output_parser.add_argument(
+ "--output-dir",
+ type=str,
+ required=True,
+ help=("Where to dump the extracted jpeg frames"),
+ )
+ output_parser.add_argument(
+ "--slurm-output-root-dir",
+ type=str,
+ required=True,
+ help=("Where to save slurm outputs"),
+ )
+ return parser
+
+
+def decode_video(video_path: str):
+ assert os.path.exists(video_path)
+ video = cv2.VideoCapture(video_path)
+ video_frames = []
+ while video.isOpened():
+ ret, frame = video.read()
+ if ret:
+ video_frames.append(frame)
+ else:
+ break
+ return video_frames
+
+
+def extract_frames(video_path, sample_rate):
+ frames = decode_video(video_path)
+ return frames[::sample_rate]
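+ # Illustrative sketch: with sample_rate=4, frames 0, 4, 8, ... are kept; the saved
+ # filenames below encode the original frame index via fid * sample_rate.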
+
+
+def submitit_launch(video_paths, sample_rate, save_root):
+ for path in tqdm.tqdm(video_paths):
+ frames = extract_frames(path, sample_rate)
+ output_folder = os.path.join(save_root, Path(path).stem)
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+ for fid, frame in enumerate(frames):
+ frame_path = os.path.join(output_folder, f"{fid*sample_rate:05d}.jpg")
+ cv2.imwrite(frame_path, frame)
+ print(f"Saved output to {save_root}")
+
+
+if __name__ == "__main__":
+ parser = get_args_parser()
+ args = parser.parse_args()
+
+ sav_vid_dir = args.sav_vid_dir
+ save_root = args.output_dir
+ sample_rate = args.sav_frame_sample_rate
+
+ # List all SA-V videos
+ mp4_files = sorted([str(p) for p in Path(sav_vid_dir).glob("*/*.mp4")])
+ mp4_files = np.array(mp4_files)
+ chunked_mp4_files = [x.tolist() for x in np.array_split(mp4_files, args.n_jobs)]
+
+ print(f"Processing videos in: {sav_vid_dir}")
+ print(f"Processing {len(mp4_files)} files")
+ print(f"Beginning processing in {args.n_jobs} processes")
+
+ # Submitit params
+ jobs_dir = os.path.join(args.slurm_output_root_dir, "%j")
+ cpus_per_task = 4
+ executor = submitit.AutoExecutor(folder=jobs_dir)
+ executor.update_parameters(
+ timeout_min=args.timeout,
+ gpus_per_node=0,
+ tasks_per_node=1,
+ slurm_array_parallelism=args.n_jobs,
+ cpus_per_task=cpus_per_task,
+ slurm_partition=args.partition,
+ slurm_account=args.account,
+ slurm_qos=args.qos,
+ )
+ executor.update_parameters(slurm_srun_args=["-vv", "--cpu-bind", "none"])
+
+ # Launch
+ jobs = []
+ with executor.batch():
+ for _, mp4_chunk in tqdm.tqdm(enumerate(chunked_mp4_files)):
+ job = executor.submit(
+ submitit_launch,
+ video_paths=mp4_chunk,
+ sample_rate=sample_rate,
+ save_root=save_root,
+ )
+ jobs.append(job)
+
+ for j in jobs:
+ print(f"Slurm JobID: {j.job_id}")
+ print(f"Saving outputs to {save_root}")
+ print(f"Slurm outputs at {args.slurm_output_root_dir}")
diff --git a/phantom/submodules/sam2/training/train.py b/phantom/submodules/sam2/training/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..db06123fcb1b2ba8ff5f462dbb7411d42a57c9a0
--- /dev/null
+++ b/phantom/submodules/sam2/training/train.py
@@ -0,0 +1,270 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import random
+import sys
+import traceback
+from argparse import ArgumentParser
+
+import submitit
+import torch
+
+from hydra import compose, initialize_config_module
+from hydra.utils import instantiate
+
+from iopath.common.file_io import g_pathmgr
+from omegaconf import OmegaConf
+
+from training.utils.train_utils import makedir, register_omegaconf_resolvers
+
+os.environ["HYDRA_FULL_ERROR"] = "1"
+
+
+def single_proc_run(local_rank, main_port, cfg, world_size):
+ """Single GPU process"""
+ os.environ["MASTER_ADDR"] = "localhost"
+ os.environ["MASTER_PORT"] = str(main_port)
+ os.environ["RANK"] = str(local_rank)
+ os.environ["LOCAL_RANK"] = str(local_rank)
+ os.environ["WORLD_SIZE"] = str(world_size)
+ try:
+ register_omegaconf_resolvers()
+ except Exception as e:
+ logging.info(e)
+
+ trainer = instantiate(cfg.trainer, _recursive_=False)
+ trainer.run()
+
+
+def single_node_runner(cfg, main_port: int):
+ assert cfg.launcher.num_nodes == 1
+ num_proc = cfg.launcher.gpus_per_node
+ torch.multiprocessing.set_start_method(
+ "spawn"
+ ) # CUDA runtime does not support `fork`
+ if num_proc == 1:
+ # directly call single_proc so we can easily set breakpoints
+ # mp.spawn does not let us set breakpoints
+ single_proc_run(local_rank=0, main_port=main_port, cfg=cfg, world_size=num_proc)
+ else:
+ mp_runner = torch.multiprocessing.start_processes
+ args = (main_port, cfg, num_proc)
+ # Note: using "fork" below, "spawn" causes time and error regressions. Using
+ # spawn changes the default multiprocessing context to spawn, which doesn't
+ # interact well with the dataloaders (likely due to the use of OpenCV).
+ mp_runner(single_proc_run, args=args, nprocs=num_proc, start_method="spawn")
+
+
+def format_exception(e: Exception, limit=20):
+ traceback_str = "".join(traceback.format_tb(e.__traceback__, limit=limit))
+ return f"{type(e).__name__}: {e}\nTraceback:\n{traceback_str}"
+
+
+class SubmititRunner(submitit.helpers.Checkpointable):
+ """A callable which is passed to submitit to launch the jobs."""
+
+ def __init__(self, port, cfg):
+ self.cfg = cfg
+ self.port = port
+ self.has_setup = False
+
+ def run_trainer(self):
+ job_env = submitit.JobEnvironment()
+ # Need to add this again so the hydra.job.set_env PYTHONPATH
+ # is also set when launching jobs.
+ add_pythonpath_to_sys_path()
+ os.environ["MASTER_ADDR"] = job_env.hostnames[0]
+ os.environ["MASTER_PORT"] = str(self.port)
+ os.environ["RANK"] = str(job_env.global_rank)
+ os.environ["LOCAL_RANK"] = str(job_env.local_rank)
+ os.environ["WORLD_SIZE"] = str(job_env.num_tasks)
+
+ register_omegaconf_resolvers()
+ cfg_resolved = OmegaConf.to_container(self.cfg, resolve=False)
+ cfg_resolved = OmegaConf.create(cfg_resolved)
+
+ trainer = instantiate(cfg_resolved.trainer, _recursive_=False)
+ trainer.run()
+
+ def __call__(self):
+ job_env = submitit.JobEnvironment()
+ self.setup_job_info(job_env.job_id, job_env.global_rank)
+ try:
+ self.run_trainer()
+ except Exception as e:
+ # Log the exception, then raise it again (as SubmititRunner currently does).
+ message = format_exception(e)
+ logging.error(message)
+ raise e
+
+ def setup_job_info(self, job_id, rank):
+ """Set up slurm job info"""
+ self.job_info = {
+ "job_id": job_id,
+ "rank": rank,
+ "cluster": self.cfg.get("cluster", None),
+ "experiment_log_dir": self.cfg.launcher.experiment_log_dir,
+ }
+
+ self.has_setup = True
+
+
+def add_pythonpath_to_sys_path():
+ if "PYTHONPATH" not in os.environ or not os.environ["PYTHONPATH"]:
+ return
+ sys.path = os.environ["PYTHONPATH"].split(":") + sys.path
+
+
+def main(args) -> None:
+ cfg = compose(config_name=args.config)
+ if cfg.launcher.experiment_log_dir is None:
+ cfg.launcher.experiment_log_dir = os.path.join(
+ os.getcwd(), "sam2_logs", args.config
+ )
+ print("###################### Train App Config ####################")
+ print(OmegaConf.to_yaml(cfg))
+ print("############################################################")
+
+ add_pythonpath_to_sys_path()
+ makedir(cfg.launcher.experiment_log_dir)
+ with g_pathmgr.open(
+ os.path.join(cfg.launcher.experiment_log_dir, "config.yaml"), "w"
+ ) as f:
+ f.write(OmegaConf.to_yaml(cfg))
+
+ cfg_resolved = OmegaConf.to_container(cfg, resolve=False)
+ cfg_resolved = OmegaConf.create(cfg_resolved)
+
+ with g_pathmgr.open(
+ os.path.join(cfg.launcher.experiment_log_dir, "config_resolved.yaml"), "w"
+ ) as f:
+ f.write(OmegaConf.to_yaml(cfg_resolved, resolve=True))
+
+ submitit_conf = cfg.get("submitit", None)
+ assert submitit_conf is not None, "Missing submitit config"
+
+ submitit_dir = cfg.launcher.experiment_log_dir
+ submitit_dir = os.path.join(submitit_dir, "submitit_logs")
+ # Prioritize cmd line args
+ cfg.launcher.gpus_per_node = (
+ args.num_gpus if args.num_gpus is not None else cfg.launcher.gpus_per_node
+ )
+ cfg.launcher.num_nodes = (
+ args.num_nodes if args.num_nodes is not None else cfg.launcher.num_nodes
+ )
+ submitit_conf.use_cluster = (
+ args.use_cluster if args.use_cluster is not None else submitit_conf.use_cluster
+ )
+ if submitit_conf.use_cluster:
+ executor = submitit.AutoExecutor(folder=submitit_dir)
+ submitit_conf.partition = (
+ args.partition
+ if args.partition is not None
+ else submitit_conf.get("partition", None)
+ )
+ submitit_conf.account = (
+ args.account
+ if args.account is not None
+ else submitit_conf.get("account", None)
+ )
+ submitit_conf.qos = (
+ args.qos if args.qos is not None else submitit_conf.get("qos", None)
+ )
+ job_kwargs = {
+ "timeout_min": 60 * submitit_conf.timeout_hour,
+ "name": (
+ submitit_conf.name if hasattr(submitit_conf, "name") else args.config
+ ),
+ "slurm_partition": submitit_conf.partition,
+ "gpus_per_node": cfg.launcher.gpus_per_node,
+ "tasks_per_node": cfg.launcher.gpus_per_node, # one task per GPU
+ "cpus_per_task": submitit_conf.cpus_per_task,
+ "nodes": cfg.launcher.num_nodes,
+ "slurm_additional_parameters": {
+ "exclude": " ".join(submitit_conf.get("exclude_nodes", [])),
+ },
+ }
+ if "include_nodes" in submitit_conf:
+ assert (
+ len(submitit_conf["include_nodes"]) >= cfg.launcher.num_nodes
+ ), "Not enough nodes"
+ job_kwargs["slurm_additional_parameters"]["nodelist"] = " ".join(
+ submitit_conf["include_nodes"]
+ )
+ if submitit_conf.account is not None:
+ job_kwargs["slurm_additional_parameters"]["account"] = submitit_conf.account
+ if submitit_conf.qos is not None:
+ job_kwargs["slurm_additional_parameters"]["qos"] = submitit_conf.qos
+
+ if submitit_conf.get("mem_gb", None) is not None:
+ job_kwargs["mem_gb"] = submitit_conf.mem_gb
+ elif submitit_conf.get("mem", None) is not None:
+ job_kwargs["slurm_mem"] = submitit_conf.mem
+
+ if submitit_conf.get("constraints", None) is not None:
+ job_kwargs["slurm_constraint"] = submitit_conf.constraints
+
+ if submitit_conf.get("comment", None) is not None:
+ job_kwargs["slurm_comment"] = submitit_conf.comment
+
+ # Supports only cpu-bind option within srun_args. New options can be added here
+ if submitit_conf.get("srun_args", None) is not None:
+ job_kwargs["slurm_srun_args"] = []
+ if submitit_conf.srun_args.get("cpu_bind", None) is not None:
+ job_kwargs["slurm_srun_args"].extend(
+ ["--cpu-bind", submitit_conf.srun_args.cpu_bind]
+ )
+
+ print("###################### SLURM Config ####################")
+ print(job_kwargs)
+ print("##########################################")
+ executor.update_parameters(**job_kwargs)
+
+ main_port = random.randint(
+ submitit_conf.port_range[0], submitit_conf.port_range[1]
+ )
+ runner = SubmititRunner(main_port, cfg)
+ job = executor.submit(runner)
+ print(f"Submitit Job ID: {job.job_id}")
+ runner.setup_job_info(job.job_id, rank=0)
+ else:
+ cfg.launcher.num_nodes = 1
+ main_port = random.randint(
+ submitit_conf.port_range[0], submitit_conf.port_range[1]
+ )
+ single_node_runner(cfg, main_port)
+
+
+if __name__ == "__main__":
+
+ initialize_config_module("sam2", version_base="1.2")
+ parser = ArgumentParser()
+ parser.add_argument(
+ "-c",
+ "--config",
+ required=True,
+ type=str,
+ help="path to config file (e.g. configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml)",
+ )
+ parser.add_argument(
+ "--use-cluster",
+ type=int,
+ default=None,
+ help="whether to launch on a cluster, 0: run locally, 1: run on a cluster",
+ )
+ parser.add_argument("--partition", type=str, default=None, help="SLURM partition")
+ parser.add_argument("--account", type=str, default=None, help="SLURM account")
+ parser.add_argument("--qos", type=str, default=None, help="SLURM qos")
+ parser.add_argument(
+ "--num-gpus", type=int, default=None, help="number of GPUS per node"
+ )
+ parser.add_argument("--num-nodes", type=int, default=None, help="Number of nodes")
+ args = parser.parse_args()
+ args.use_cluster = bool(args.use_cluster) if args.use_cluster is not None else None
+ register_omegaconf_resolvers()
+ main(args)
diff --git a/phantom/submodules/sam2/training/trainer.py b/phantom/submodules/sam2/training/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b7c27b5145e2c03848331345ac246296accbc1d
--- /dev/null
+++ b/phantom/submodules/sam2/training/trainer.py
@@ -0,0 +1,1113 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import gc
+import json
+import logging
+import math
+import os
+import time
+from collections import OrderedDict
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Mapping, Optional
+
+import numpy as np
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from hydra.utils import instantiate
+from iopath.common.file_io import g_pathmgr
+
+from training.optimizer import construct_optimizer
+
+from training.utils.checkpoint_utils import (
+ assert_skipped_parameters_are_frozen,
+ exclude_params_matching_unix_pattern,
+ load_state_dict_into_model,
+ with_check_parameter_frozen,
+)
+from training.utils.data_utils import BatchedVideoDatapoint
+from training.utils.distributed import all_reduce_max, barrier, get_rank
+
+from training.utils.logger import Logger, setup_logging
+
+from training.utils.train_utils import (
+ AverageMeter,
+ collect_dict_keys,
+ DurationMeter,
+ get_amp_type,
+ get_machine_local_and_dist_rank,
+ get_resume_checkpoint,
+ human_readable_time,
+ is_dist_avail_and_initialized,
+ log_env_variables,
+ makedir,
+ MemMeter,
+ Phase,
+ ProgressMeter,
+ set_seeds,
+ setup_distributed_backend,
+)
+
+
+CORE_LOSS_KEY = "core_loss"
+
+
+def unwrap_ddp_if_wrapped(model):
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel):
+ return model.module
+ return model
+
+
+@dataclass
+class OptimAMPConf:
+ enabled: bool = False
+ amp_dtype: str = "float16"
+
+
+@dataclass
+class OptimConf:
+ optimizer: torch.optim.Optimizer = None
+ options: Optional[Dict[str, Any]] = None
+ param_group_modifiers: Optional[List] = None
+ amp: Optional[Dict[str, Any]] = None
+ gradient_clip: Any = None
+ gradient_logger: Any = None
+
+ def __post_init__(self):
+ # amp
+ if not isinstance(self.amp, OptimAMPConf):
+ if self.amp is None:
+ self.amp = {}
+ assert isinstance(self.amp, Mapping)
+ self.amp = OptimAMPConf(**self.amp)
+
+
+@dataclass
+class DistributedConf:
+ backend: Optional[str] = None # inferred from accelerator type
+ comms_dtype: Optional[str] = None
+ find_unused_parameters: bool = False
+ timeout_mins: int = 30
+
+
+@dataclass
+class CudaConf:
+ cudnn_deterministic: bool = False
+ cudnn_benchmark: bool = True
+ allow_tf32: bool = False
+ # if not None, `matmul_allow_tf32` key will override `allow_tf32` for matmul
+ matmul_allow_tf32: Optional[bool] = None
+ # if not None, `cudnn_allow_tf32` key will override `allow_tf32` for cudnn
+ cudnn_allow_tf32: Optional[bool] = None
+
+
+@dataclass
+class CheckpointConf:
+ save_dir: str
+ save_freq: int
+ save_list: List[int] = field(default_factory=list)
+ model_weight_initializer: Any = None
+ save_best_meters: List[str] = None
+ skip_saving_parameters: List[str] = field(default_factory=list)
+ initialize_after_preemption: Optional[bool] = None
+ # if not None, training will be resumed from this checkpoint
+ resume_from: Optional[str] = None
+
+ def infer_missing(self):
+ if self.initialize_after_preemption is None:
+ with_skip_saving = len(self.skip_saving_parameters) > 0
+ self.initialize_after_preemption = with_skip_saving
+ return self
+
+
+@dataclass
+class LoggingConf:
+ log_dir: str
+ log_freq: int # In iterations
+ tensorboard_writer: Any
+ log_level_primary: str = "INFO"
+ log_level_secondary: str = "ERROR"
+ log_scalar_frequency: int = 100
+ log_visual_frequency: int = 100
+ scalar_keys_to_log: Optional[Dict[str, Any]] = None
+ log_batch_stats: bool = False
+
+
+class Trainer:
+ """
+ Trainer supporting the DDP training strategies.
+ """
+
+ EPSILON = 1e-8
+
+ def __init__(
+ self,
+ *, # the order of these args can change at any time, so they are keyword-only
+ data: Dict[str, Any],
+ model: Dict[str, Any],
+ logging: Dict[str, Any],
+ checkpoint: Dict[str, Any],
+ max_epochs: int,
+ mode: str = "train",
+ accelerator: str = "cuda",
+ seed_value: int = 123,
+ val_epoch_freq: int = 1,
+ distributed: Dict[str, bool] = None,
+ cuda: Dict[str, bool] = None,
+ env_variables: Optional[Dict[str, Any]] = None,
+ optim: Optional[Dict[str, Any]] = None,
+ optim_overrides: Optional[List[Dict[str, Any]]] = None,
+ meters: Optional[Dict[str, Any]] = None,
+ loss: Optional[Dict[str, Any]] = None,
+ ):
+
+ self._setup_env_variables(env_variables)
+ self._setup_timers()
+
+ self.data_conf = data
+ self.model_conf = model
+ self.logging_conf = LoggingConf(**logging)
+ self.checkpoint_conf = CheckpointConf(**checkpoint).infer_missing()
+ self.max_epochs = max_epochs
+ self.mode = mode
+ self.val_epoch_freq = val_epoch_freq
+ self.optim_conf = OptimConf(**optim) if optim is not None else None
+ self.meters_conf = meters
+ self.loss_conf = loss
+ distributed = DistributedConf(**distributed or {})
+ cuda = CudaConf(**cuda or {})
+ self.where = 0.0
+
+ self._infer_distributed_backend_if_none(distributed, accelerator)
+
+ self._setup_device(accelerator)
+
+ self._setup_torch_dist_and_backend(cuda, distributed)
+
+ makedir(self.logging_conf.log_dir)
+ setup_logging(
+ __name__,
+ output_dir=self.logging_conf.log_dir,
+ rank=self.rank,
+ log_level_primary=self.logging_conf.log_level_primary,
+ log_level_secondary=self.logging_conf.log_level_secondary,
+ )
+
+ set_seeds(seed_value, self.max_epochs, self.distributed_rank)
+ log_env_variables()
+
+ assert (
+ is_dist_avail_and_initialized()
+ ), "Torch distributed needs to be initialized before calling the trainer."
+
+ self._setup_components() # Except Optimizer everything is setup here.
+ self._move_to_device()
+ self._construct_optimizers()
+ self._setup_dataloaders()
+
+ self.time_elapsed_meter = DurationMeter("Time Elapsed", self.device, ":.2f")
+
+ if self.checkpoint_conf.resume_from is not None:
+ assert os.path.exists(
+ self.checkpoint_conf.resume_from
+ ), f"The 'resume_from' checkpoint {self.checkpoint_conf.resume_from} does not exist!"
+ dst = os.path.join(self.checkpoint_conf.save_dir, "checkpoint.pt")
+ if self.distributed_rank == 0 and not os.path.exists(dst):
+ # Copy the "resume_from" checkpoint to the checkpoint folder
+ # if there is not a checkpoint to resume from already there
+ makedir(self.checkpoint_conf.save_dir)
+ g_pathmgr.copy(self.checkpoint_conf.resume_from, dst)
+ barrier()
+
+ self.load_checkpoint()
+ self._setup_ddp_distributed_training(distributed, accelerator)
+ barrier()
+
+ def _setup_timers(self):
+ """
+ Initializes counters for elapsed time and eta.
+ """
+ self.start_time = time.time()
+ self.ckpt_time_elapsed = 0
+ self.est_epoch_time = dict.fromkeys([Phase.TRAIN, Phase.VAL], 0)
+
+ def _get_meters(self, phase_filters=None):
+ if self.meters is None:
+ return {}
+ meters = {}
+ for phase, phase_meters in self.meters.items():
+ if phase_filters is not None and phase not in phase_filters:
+ continue
+ for key, key_meters in phase_meters.items():
+ if key_meters is None:
+ continue
+ for name, meter in key_meters.items():
+ meters[f"{phase}_{key}/{name}"] = meter
+ return meters
+
+ def _infer_distributed_backend_if_none(self, distributed_conf, accelerator):
+ if distributed_conf.backend is None:
+ distributed_conf.backend = "nccl" if accelerator == "cuda" else "gloo"
+
+ def _setup_env_variables(self, env_variables_conf) -> None:
+ if env_variables_conf is not None:
+ for variable_name, value in env_variables_conf.items():
+ os.environ[variable_name] = value
+
+ def _setup_torch_dist_and_backend(self, cuda_conf, distributed_conf) -> None:
+ if torch.cuda.is_available():
+ torch.backends.cudnn.deterministic = cuda_conf.cudnn_deterministic
+ torch.backends.cudnn.benchmark = cuda_conf.cudnn_benchmark
+ torch.backends.cuda.matmul.allow_tf32 = (
+ cuda_conf.matmul_allow_tf32
+ if cuda_conf.matmul_allow_tf32 is not None
+ else cuda_conf.allow_tf32
+ )
+ torch.backends.cudnn.allow_tf32 = (
+ cuda_conf.cudnn_allow_tf32
+ if cuda_conf.cudnn_allow_tf32 is not None
+ else cuda_conf.allow_tf32
+ )
+
+ self.rank = setup_distributed_backend(
+ distributed_conf.backend, distributed_conf.timeout_mins
+ )
+
+ def _setup_device(self, accelerator):
+ self.local_rank, self.distributed_rank = get_machine_local_and_dist_rank()
+ if accelerator == "cuda":
+ self.device = torch.device("cuda", self.local_rank)
+ torch.cuda.set_device(self.local_rank)
+ elif accelerator == "cpu":
+ self.device = torch.device("cpu")
+ else:
+ raise ValueError(f"Unsupported accelerator: {accelerator}")
+
+ def _setup_ddp_distributed_training(self, distributed_conf, accelerator):
+
+ assert isinstance(self.model, torch.nn.Module)
+
+ self.model = nn.parallel.DistributedDataParallel(
+ self.model,
+ device_ids=[self.local_rank] if accelerator == "cuda" else [],
+ find_unused_parameters=distributed_conf.find_unused_parameters,
+ )
+ if distributed_conf.comms_dtype is not None: # noqa
+ from torch.distributed.algorithms import ddp_comm_hooks
+
+ amp_type = get_amp_type(distributed_conf.comms_dtype)
+ if amp_type == torch.bfloat16:
+ hook = ddp_comm_hooks.default_hooks.bf16_compress_hook
+ logging.info("Enabling bfloat16 grad communication")
+ else:
+ hook = ddp_comm_hooks.default_hooks.fp16_compress_hook
+ logging.info("Enabling fp16 grad communication")
+ process_group = None
+ self.model.register_comm_hook(process_group, hook)
+
+ def _move_to_device(self):
+ logging.info(
+ f"Moving components to device {self.device} and local rank {self.local_rank}."
+ )
+
+ self.model.to(self.device)
+
+ logging.info(
+ f"Done moving components to device {self.device} and local rank {self.local_rank}."
+ )
+
+ def save_checkpoint(self, epoch, checkpoint_names=None):
+ checkpoint_folder = self.checkpoint_conf.save_dir
+ makedir(checkpoint_folder)
+ if checkpoint_names is None:
+ checkpoint_names = ["checkpoint"]
+ if (
+ self.checkpoint_conf.save_freq > 0
+ and (int(epoch) % self.checkpoint_conf.save_freq == 0)
+ ) or int(epoch) in self.checkpoint_conf.save_list:
+ checkpoint_names.append(f"checkpoint_{int(epoch)}")
+
+ checkpoint_paths = []
+ for ckpt_name in checkpoint_names:
+ checkpoint_paths.append(os.path.join(checkpoint_folder, f"{ckpt_name}.pt"))
+
+ state_dict = unwrap_ddp_if_wrapped(self.model).state_dict()
+ state_dict = exclude_params_matching_unix_pattern(
+ patterns=self.checkpoint_conf.skip_saving_parameters, state_dict=state_dict
+ )
+
+ checkpoint = {
+ "model": state_dict,
+ "optimizer": self.optim.optimizer.state_dict(),
+ "epoch": epoch,
+ "loss": self.loss.state_dict(),
+ "steps": self.steps,
+ "time_elapsed": self.time_elapsed_meter.val,
+ "best_meter_values": self.best_meter_values,
+ }
+ if self.optim_conf.amp.enabled:
+ checkpoint["scaler"] = self.scaler.state_dict()
+
+ # DDP checkpoints are only saved on rank 0 (all workers are identical)
+ if self.distributed_rank != 0:
+ return
+
+ for checkpoint_path in checkpoint_paths:
+ self._save_checkpoint(checkpoint, checkpoint_path)
+
+ def _save_checkpoint(self, checkpoint, checkpoint_path):
+ """
+ Save a checkpoint while guarding against the job being killed in the middle
+ of checkpoint saving (which corrupts the checkpoint file and ruins the
+ entire training since usually only the last checkpoint is kept per run).
+
+ We first save the new checkpoint to a temp file (with a '.tmp' suffix), and
+ then move it to overwrite the old checkpoint_path.
+ """
+ checkpoint_path_tmp = f"{checkpoint_path}.tmp"
+ with g_pathmgr.open(checkpoint_path_tmp, "wb") as f:
+ torch.save(checkpoint, f)
+ # after torch.save is completed, replace the old checkpoint with the new one
+ if g_pathmgr.exists(checkpoint_path):
+ # remove the old checkpoint_path file first (otherwise g_pathmgr.mv fails)
+ g_pathmgr.rm(checkpoint_path)
+ success = g_pathmgr.mv(checkpoint_path_tmp, checkpoint_path)
+ assert success
+
+ def load_checkpoint(self):
+ ckpt_path = get_resume_checkpoint(self.checkpoint_conf.save_dir)
+ if ckpt_path is None:
+ self._init_model_state()
+ else:
+ if self.checkpoint_conf.initialize_after_preemption:
+ self._call_model_initializer()
+ self._load_resuming_checkpoint(ckpt_path)
+
+ def _init_model_state(self):
+ # Checking that parameters that won't be saved are indeed frozen
+ # We do this check here before even saving the model to catch errors
+ # as early as possible and not at the end of the first epoch
+ assert_skipped_parameters_are_frozen(
+ patterns=self.checkpoint_conf.skip_saving_parameters,
+ model=self.model,
+ )
+
+ # Checking that parameters that won't be saved are initialized from
+ # within the model definition, unless `initialize_after_preemption`
+ # is explicitly set to `True`. If not, this is a bug, and after
+ # preemption, the `skip_saving_parameters` will have random values
+ allow_init_skip_parameters = self.checkpoint_conf.initialize_after_preemption
+ with with_check_parameter_frozen(
+ patterns=self.checkpoint_conf.skip_saving_parameters,
+ model=self.model,
+ disabled=allow_init_skip_parameters,
+ ):
+ self._call_model_initializer()
+
+ def _call_model_initializer(self):
+ model_weight_initializer = instantiate(
+ self.checkpoint_conf.model_weight_initializer
+ )
+ if model_weight_initializer is not None:
+ logging.info(
+ f"Loading pretrained checkpoint from {self.checkpoint_conf.model_weight_initializer}"
+ )
+ self.model = model_weight_initializer(model=self.model)
+
+ def _load_resuming_checkpoint(self, ckpt_path: str):
+ logging.info(f"Resuming training from {ckpt_path}")
+
+ with g_pathmgr.open(ckpt_path, "rb") as f:
+ checkpoint = torch.load(f, map_location="cpu")
+ load_state_dict_into_model(
+ model=self.model,
+ state_dict=checkpoint["model"],
+ ignore_missing_keys=self.checkpoint_conf.skip_saving_parameters,
+ )
+
+ self.optim.optimizer.load_state_dict(checkpoint["optimizer"])
+ self.loss.load_state_dict(checkpoint["loss"], strict=True)
+ self.epoch = checkpoint["epoch"]
+ self.steps = checkpoint["steps"]
+ self.ckpt_time_elapsed = checkpoint.get("time_elapsed")
+
+ if self.optim_conf.amp.enabled and "scaler" in checkpoint:
+ self.scaler.load_state_dict(checkpoint["scaler"])
+
+ self.best_meter_values = checkpoint.get("best_meter_values", {})
+
+ if "train_dataset" in checkpoint and self.train_dataset is not None:
+ self.train_dataset.load_checkpoint_state(checkpoint["train_dataset"])
+
+ def is_intermediate_val_epoch(self, epoch):
+ return epoch % self.val_epoch_freq == 0 and epoch < self.max_epochs - 1
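+ # Illustrative sketch (values are hypothetical): with val_epoch_freq=2 and
+ # max_epochs=10, intermediate validation runs after epochs 0, 2, 4, 6 and 8;
+ # the final epoch is covered by the run_val() call after the training loop.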
+
+ def _step(
+ self,
+ batch: BatchedVideoDatapoint,
+ model: nn.Module,
+ phase: str,
+ ):
+
+ outputs = model(batch)
+ targets = batch.masks
+ batch_size = len(batch.img_batch)
+
+ key = batch.dict_key # key for dataset
+ loss = self.loss[key](outputs, targets)
+ loss_str = f"Losses/{phase}_{key}_loss"
+
+ loss_log_str = os.path.join("Step_Losses", loss_str)
+
+ # loss contains multiple sub-components we wish to log
+ step_losses = {}
+ if isinstance(loss, dict):
+ step_losses.update(
+ {f"Losses/{phase}_{key}_{k}": v for k, v in loss.items()}
+ )
+ loss = self._log_loss_detailed_and_return_core_loss(
+ loss, loss_log_str, self.steps[phase]
+ )
+
+ if self.steps[phase] % self.logging_conf.log_scalar_frequency == 0:
+ self.logger.log(
+ loss_log_str,
+ loss,
+ self.steps[phase],
+ )
+
+ self.steps[phase] += 1
+
+ ret_tuple = {loss_str: loss}, batch_size, step_losses
+
+ if phase in self.meters and key in self.meters[phase]:
+ meters_dict = self.meters[phase][key]
+ if meters_dict is not None:
+ for _, meter in meters_dict.items():
+ meter.update(
+ find_stages=outputs,
+ find_metadatas=batch.metadata,
+ )
+
+ return ret_tuple
+
+ def run(self):
+ assert self.mode in ["train", "train_only", "val"]
+ if self.mode == "train":
+ if self.epoch > 0:
+ logging.info(f"Resuming training from epoch: {self.epoch}")
+ # resuming from a checkpoint
+ if self.is_intermediate_val_epoch(self.epoch - 1):
+ logging.info("Running previous val epoch")
+ self.epoch -= 1
+ self.run_val()
+ self.epoch += 1
+ self.run_train()
+ self.run_val()
+ elif self.mode == "val":
+ self.run_val()
+ elif self.mode == "train_only":
+ self.run_train()
+
+ def _setup_dataloaders(self):
+ self.train_dataset = None
+ self.val_dataset = None
+
+ if self.mode in ["train", "val"]:
+ self.val_dataset = instantiate(self.data_conf.get(Phase.VAL, None))
+
+ if self.mode in ["train", "train_only"]:
+ self.train_dataset = instantiate(self.data_conf.train)
+
+ def run_train(self):
+
+ while self.epoch < self.max_epochs:
+ dataloader = self.train_dataset.get_loader(epoch=int(self.epoch))
+ barrier()
+ outs = self.train_epoch(dataloader)
+ self.logger.log_dict(outs, self.epoch) # Logged only on rank 0
+
+ # log train to text file.
+ if self.distributed_rank == 0:
+ with g_pathmgr.open(
+ os.path.join(self.logging_conf.log_dir, "train_stats.json"),
+ "a",
+ ) as f:
+ f.write(json.dumps(outs) + "\n")
+
+ # Save checkpoint before validating
+ self.save_checkpoint(self.epoch + 1)
+
+ del dataloader
+ gc.collect()
+
+ # Run val, not running on last epoch since will run after the
+ # loop anyway
+ if self.is_intermediate_val_epoch(self.epoch):
+ self.run_val()
+
+ if self.distributed_rank == 0:
+ self.best_meter_values.update(self._get_trainer_state("train"))
+ with g_pathmgr.open(
+ os.path.join(self.logging_conf.log_dir, "best_stats.json"),
+ "a",
+ ) as f:
+ f.write(json.dumps(self.best_meter_values) + "\n")
+
+ self.epoch += 1
+ # epoch was incremented in the loop but the final val step runs outside the loop
+ self.epoch -= 1
+
+ def run_val(self):
+ if not self.val_dataset:
+ return
+
+ dataloader = self.val_dataset.get_loader(epoch=int(self.epoch))
+ outs = self.val_epoch(dataloader, phase=Phase.VAL)
+ del dataloader
+ gc.collect()
+ self.logger.log_dict(outs, self.epoch) # Logged only on rank 0
+
+ if self.distributed_rank == 0:
+ with g_pathmgr.open(
+ os.path.join(self.logging_conf.log_dir, "val_stats.json"),
+ "a",
+ ) as f:
+ f.write(json.dumps(outs) + "\n")
+
+ def val_epoch(self, val_loader, phase):
+ batch_time = AverageMeter("Batch Time", self.device, ":.2f")
+ data_time = AverageMeter("Data Time", self.device, ":.2f")
+ mem = MemMeter("Mem (GB)", self.device, ":.2f")
+
+ iters_per_epoch = len(val_loader)
+
+ curr_phases = [phase]
+ curr_models = [self.model]
+
+ loss_names = []
+ for p in curr_phases:
+ for key in self.loss.keys():
+ loss_names.append(f"Losses/{p}_{key}_loss")
+
+ loss_mts = OrderedDict(
+ [(name, AverageMeter(name, self.device, ":.2e")) for name in loss_names]
+ )
+ extra_loss_mts = {}
+
+ for model in curr_models:
+ model.eval()
+ if hasattr(unwrap_ddp_if_wrapped(model), "on_validation_epoch_start"):
+ unwrap_ddp_if_wrapped(model).on_validation_epoch_start()
+
+ progress = ProgressMeter(
+ iters_per_epoch,
+ [batch_time, data_time, mem, self.time_elapsed_meter, *loss_mts.values()],
+ self._get_meters(curr_phases),
+ prefix="Val Epoch: [{}]".format(self.epoch),
+ )
+
+ end = time.time()
+
+ for data_iter, batch in enumerate(val_loader):
+
+ # measure data loading time
+ data_time.update(time.time() - end)
+
+ batch = batch.to(self.device, non_blocking=True)
+
+ # compute output
+ with torch.no_grad():
+ with torch.cuda.amp.autocast(
+ enabled=(self.optim_conf.amp.enabled if self.optim_conf else False),
+ dtype=(
+ get_amp_type(self.optim_conf.amp.amp_dtype)
+ if self.optim_conf
+ else None
+ ),
+ ):
+ for phase, model in zip(curr_phases, curr_models):
+ loss_dict, batch_size, extra_losses = self._step(
+ batch,
+ model,
+ phase,
+ )
+
+ assert len(loss_dict) == 1
+ loss_key, loss = loss_dict.popitem()
+
+ loss_mts[loss_key].update(loss.item(), batch_size)
+
+ for k, v in extra_losses.items():
+ if k not in extra_loss_mts:
+ extra_loss_mts[k] = AverageMeter(k, self.device, ":.2e")
+ extra_loss_mts[k].update(v.item(), batch_size)
+
+ # measure elapsed time
+ batch_time.update(time.time() - end)
+ end = time.time()
+
+ self.time_elapsed_meter.update(
+ time.time() - self.start_time + self.ckpt_time_elapsed
+ )
+
+ if torch.cuda.is_available():
+ mem.update(reset_peak_usage=True)
+
+ if data_iter % self.logging_conf.log_freq == 0:
+ progress.display(data_iter)
+
+ if data_iter % self.logging_conf.log_scalar_frequency == 0:
+ # Log progress meters.
+ for progress_meter in progress.meters:
+ self.logger.log(
+ os.path.join("Step_Stats", phase, progress_meter.name),
+ progress_meter.val,
+ self.steps[Phase.VAL],
+ )
+
+ if data_iter % 10 == 0:
+ dist.barrier()
+
+ self.est_epoch_time[phase] = batch_time.avg * iters_per_epoch
+ self._log_timers(phase)
+ for model in curr_models:
+ if hasattr(unwrap_ddp_if_wrapped(model), "on_validation_epoch_end"):
+ unwrap_ddp_if_wrapped(model).on_validation_epoch_end()
+
+ out_dict = self._log_meters_and_save_best_ckpts(curr_phases)
+
+ for k, v in loss_mts.items():
+ out_dict[k] = v.avg
+ for k, v in extra_loss_mts.items():
+ out_dict[k] = v.avg
+
+ for phase in curr_phases:
+ out_dict.update(self._get_trainer_state(phase))
+ self._reset_meters(curr_phases)
+ logging.info(f"Meters: {out_dict}")
+ return out_dict
+
+ def _get_trainer_state(self, phase):
+ return {
+ "Trainer/where": self.where,
+ "Trainer/epoch": self.epoch,
+ f"Trainer/steps_{phase}": self.steps[phase],
+ }
+
+ def train_epoch(self, train_loader):
+
+ # Init stat meters
+ batch_time_meter = AverageMeter("Batch Time", self.device, ":.2f")
+ data_time_meter = AverageMeter("Data Time", self.device, ":.2f")
+ mem_meter = MemMeter("Mem (GB)", self.device, ":.2f")
+ data_times = []
+ phase = Phase.TRAIN
+
+ iters_per_epoch = len(train_loader)
+
+ loss_names = []
+ for batch_key in self.loss.keys():
+ loss_names.append(f"Losses/{phase}_{batch_key}_loss")
+
+ loss_mts = OrderedDict(
+ [(name, AverageMeter(name, self.device, ":.2e")) for name in loss_names]
+ )
+ extra_loss_mts = {}
+
+ progress = ProgressMeter(
+ iters_per_epoch,
+ [
+ batch_time_meter,
+ data_time_meter,
+ mem_meter,
+ self.time_elapsed_meter,
+ *loss_mts.values(),
+ ],
+ self._get_meters([phase]),
+ prefix="Train Epoch: [{}]".format(self.epoch),
+ )
+
+ # Model training loop
+ self.model.train()
+ end = time.time()
+
+ for data_iter, batch in enumerate(train_loader):
+ # measure data loading time
+ data_time_meter.update(time.time() - end)
+ data_times.append(data_time_meter.val)
+ batch = batch.to(
+ self.device, non_blocking=True
+ ) # move tensors in a tensorclass
+
+ try:
+ self._run_step(batch, phase, loss_mts, extra_loss_mts)
+
+ # compute gradient and do optim step
+ exact_epoch = self.epoch + float(data_iter) / iters_per_epoch
+ self.where = float(exact_epoch) / self.max_epochs
+ assert self.where <= 1 + self.EPSILON
+ if self.where < 1.0:
+ self.optim.step_schedulers(
+ self.where, step=int(exact_epoch * iters_per_epoch)
+ )
+ else:
+ logging.warning(
+                        f"Skipping scheduler update since the training is at the end, i.e., {self.where} of [0,1]."
+ )
+
+ # Log schedulers
+ if data_iter % self.logging_conf.log_scalar_frequency == 0:
+ for j, param_group in enumerate(self.optim.optimizer.param_groups):
+ for option in self.optim.schedulers[j]:
+ optim_prefix = (
+ "" + f"{j}_"
+ if len(self.optim.optimizer.param_groups) > 1
+ else ""
+ )
+ self.logger.log(
+ os.path.join("Optim", f"{optim_prefix}", option),
+ param_group[option],
+ self.steps[phase],
+ )
+
+ # Clipping gradients and detecting diverging gradients
+ if self.gradient_clipper is not None:
+ self.scaler.unscale_(self.optim.optimizer)
+ self.gradient_clipper(model=self.model)
+
+ if self.gradient_logger is not None:
+ self.gradient_logger(
+ self.model, rank=self.distributed_rank, where=self.where
+ )
+
+ # Optimizer step: the scaler will make sure gradients are not
+ # applied if the gradients are infinite
+ self.scaler.step(self.optim.optimizer)
+ self.scaler.update()
+
+ # measure elapsed time
+ batch_time_meter.update(time.time() - end)
+ end = time.time()
+
+ self.time_elapsed_meter.update(
+ time.time() - self.start_time + self.ckpt_time_elapsed
+ )
+
+ mem_meter.update(reset_peak_usage=True)
+ if data_iter % self.logging_conf.log_freq == 0:
+ progress.display(data_iter)
+
+ if data_iter % self.logging_conf.log_scalar_frequency == 0:
+ # Log progress meters.
+ for progress_meter in progress.meters:
+ self.logger.log(
+ os.path.join("Step_Stats", phase, progress_meter.name),
+ progress_meter.val,
+ self.steps[phase],
+ )
+
+ # Catching NaN/Inf errors in the loss
+ except FloatingPointError as e:
+ raise e
+
+ self.est_epoch_time[Phase.TRAIN] = batch_time_meter.avg * iters_per_epoch
+ self._log_timers(Phase.TRAIN)
+ self._log_sync_data_times(Phase.TRAIN, data_times)
+
+ out_dict = self._log_meters_and_save_best_ckpts([Phase.TRAIN])
+
+ for k, v in loss_mts.items():
+ out_dict[k] = v.avg
+ for k, v in extra_loss_mts.items():
+ out_dict[k] = v.avg
+ out_dict.update(self._get_trainer_state(phase))
+ logging.info(f"Losses and meters: {out_dict}")
+ self._reset_meters([phase])
+ return out_dict
+
+ def _log_sync_data_times(self, phase, data_times):
+ data_times = all_reduce_max(torch.tensor(data_times)).tolist()
+ steps = range(self.steps[phase] - len(data_times), self.steps[phase])
+ for step, data_time in zip(steps, data_times):
+ if step % self.logging_conf.log_scalar_frequency == 0:
+ self.logger.log(
+ os.path.join("Step_Stats", phase, "Data Time Synced"),
+ data_time,
+ step,
+ )
+
+ def _run_step(
+ self,
+ batch: BatchedVideoDatapoint,
+ phase: str,
+ loss_mts: Dict[str, AverageMeter],
+ extra_loss_mts: Dict[str, AverageMeter],
+ raise_on_error: bool = True,
+ ):
+ """
+ Run the forward / backward
+ """
+
+        # It's important to set grads to None, especially with Adam, since zero
+        # grads will still update the model even if the step doesn't produce
+        # gradients
+ self.optim.zero_grad(set_to_none=True)
+ with torch.cuda.amp.autocast(
+ enabled=self.optim_conf.amp.enabled,
+ dtype=get_amp_type(self.optim_conf.amp.amp_dtype),
+ ):
+ loss_dict, batch_size, extra_losses = self._step(
+ batch,
+ self.model,
+ phase,
+ )
+
+ assert len(loss_dict) == 1
+ loss_key, loss = loss_dict.popitem()
+
+ if not math.isfinite(loss.item()):
+ error_msg = f"Loss is {loss.item()}, attempting to stop training"
+ logging.error(error_msg)
+ if raise_on_error:
+ raise FloatingPointError(error_msg)
+ else:
+ return
+
+ self.scaler.scale(loss).backward()
+ loss_mts[loss_key].update(loss.item(), batch_size)
+ for extra_loss_key, extra_loss in extra_losses.items():
+ if extra_loss_key not in extra_loss_mts:
+ extra_loss_mts[extra_loss_key] = AverageMeter(
+ extra_loss_key, self.device, ":.2e"
+ )
+ extra_loss_mts[extra_loss_key].update(extra_loss.item(), batch_size)
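
For reference, `_run_step` above follows the standard PyTorch mixed-precision recipe: autocast around the forward pass and `GradScaler` for the scaled backward and the skipped-on-inf optimizer step. A minimal standalone sketch of that recipe (the toy model, optimizer, and data are illustrative, not part of this patch):

```python
import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
use_amp = device == "cuda"

model = nn.Linear(8, 1).to(device)
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

x, y = torch.randn(4, 8, device=device), torch.randn(4, 1, device=device)

optim.zero_grad(set_to_none=True)          # None grads, as in _run_step
with torch.cuda.amp.autocast(enabled=use_amp):
    loss = nn.functional.mse_loss(model(x), y)
scaler.scale(loss).backward()              # backward on the scaled loss
scaler.step(optim)                         # step is skipped if grads are inf/NaN
scaler.update()
```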
+
+ def _log_meters_and_save_best_ckpts(self, phases: List[str]):
+ logging.info("Synchronizing meters")
+ out_dict = {}
+ checkpoint_save_keys = []
+ for key, meter in self._get_meters(phases).items():
+ meter_output = meter.compute_synced()
+ is_better_check = getattr(meter, "is_better", None)
+
+ for meter_subkey, meter_value in meter_output.items():
+ out_dict[os.path.join("Meters_train", key, meter_subkey)] = meter_value
+
+ if is_better_check is None:
+ continue
+
+ tracked_meter_key = os.path.join(key, meter_subkey)
+ if tracked_meter_key not in self.best_meter_values or is_better_check(
+ meter_value,
+ self.best_meter_values[tracked_meter_key],
+ ):
+ self.best_meter_values[tracked_meter_key] = meter_value
+
+ if (
+ self.checkpoint_conf.save_best_meters is not None
+ and key in self.checkpoint_conf.save_best_meters
+ ):
+ checkpoint_save_keys.append(tracked_meter_key.replace("/", "_"))
+
+ if len(checkpoint_save_keys) > 0:
+ self.save_checkpoint(self.epoch + 1, checkpoint_save_keys)
+
+ return out_dict
+
+ def _log_timers(self, phase):
+ time_remaining = 0
+ epochs_remaining = self.max_epochs - self.epoch - 1
+ val_epochs_remaining = sum(
+ n % self.val_epoch_freq == 0 for n in range(self.epoch, self.max_epochs)
+ )
+
+ # Adding the guaranteed val run at the end if val_epoch_freq doesn't coincide with
+ # the end epoch.
+ if (self.max_epochs - 1) % self.val_epoch_freq != 0:
+ val_epochs_remaining += 1
+
+ # Remove the current val run from estimate
+ if phase == Phase.VAL:
+ val_epochs_remaining -= 1
+
+ time_remaining += (
+ epochs_remaining * self.est_epoch_time[Phase.TRAIN]
+ + val_epochs_remaining * self.est_epoch_time[Phase.VAL]
+ )
+
+ self.logger.log(
+ os.path.join("Step_Stats", phase, self.time_elapsed_meter.name),
+ self.time_elapsed_meter.val,
+ self.steps[phase],
+ )
+
+ logging.info(f"Estimated time remaining: {human_readable_time(time_remaining)}")
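
A small worked example of the time-remaining estimate computed above, with made-up epoch counts and per-epoch timings:

```python
max_epochs, val_epoch_freq, epoch = 10, 3, 4
est_train, est_val = 600.0, 120.0  # seconds per train / val epoch (made up)

epochs_remaining = max_epochs - epoch - 1                       # 5
val_epochs_remaining = sum(
    n % val_epoch_freq == 0 for n in range(epoch, max_epochs)   # epochs 6 and 9 -> 2
)
if (max_epochs - 1) % val_epoch_freq != 0:                      # 9 % 3 == 0, so no extra val
    val_epochs_remaining += 1

time_remaining = epochs_remaining * est_train + val_epochs_remaining * est_val
print(time_remaining)  # 5 * 600 + 2 * 120 = 3240.0 seconds
```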
+
+ def _reset_meters(self, phases: str) -> None:
+ for meter in self._get_meters(phases).values():
+ meter.reset()
+
+ def _check_val_key_match(self, val_keys, phase):
+ if val_keys is not None:
+ # Check if there are any duplicates
+ assert len(val_keys) == len(
+ set(val_keys)
+ ), f"Duplicate keys in val datasets, keys: {val_keys}"
+
+ # Check that the keys match the meter keys
+ if self.meters_conf is not None and phase in self.meters_conf:
+ assert set(val_keys) == set(self.meters_conf[phase].keys()), (
+ f"Keys in val datasets do not match the keys in meters."
+ f"\nMissing in meters: {set(val_keys) - set(self.meters_conf[phase].keys())}"
+ f"\nMissing in val datasets: {set(self.meters_conf[phase].keys()) - set(val_keys)}"
+ )
+
+ if self.loss_conf is not None:
+ loss_keys = set(self.loss_conf.keys()) - set(["all"])
+ assert all([k in loss_keys for k in val_keys]), (
+ f"Keys in val datasets do not match the keys in losses."
+ f"\nMissing in losses: {set(val_keys) - loss_keys}"
+ f"\nMissing in val datasets: {loss_keys - set(val_keys)}"
+ )
+
+ def _setup_components(self):
+
+ # Get the keys for all the val datasets, if any
+ val_phase = Phase.VAL
+ val_keys = None
+ if self.data_conf.get(val_phase, None) is not None:
+ val_keys = collect_dict_keys(self.data_conf[val_phase])
+ # Additional checks on the sanity of the config for val datasets
+ self._check_val_key_match(val_keys, phase=val_phase)
+
+ logging.info("Setting up components: Model, loss, optim, meters etc.")
+ self.epoch = 0
+ self.steps = {Phase.TRAIN: 0, Phase.VAL: 0}
+
+ self.logger = Logger(self.logging_conf)
+
+ self.model = instantiate(self.model_conf, _convert_="all")
+ print_model_summary(self.model)
+
+ self.loss = None
+ if self.loss_conf:
+ self.loss = {
+ key: el # wrap_base_loss(el)
+ for (key, el) in instantiate(self.loss_conf, _convert_="all").items()
+ }
+ self.loss = nn.ModuleDict(self.loss)
+
+ self.meters = {}
+ self.best_meter_values = {}
+ if self.meters_conf:
+ self.meters = instantiate(self.meters_conf, _convert_="all")
+
+ self.scaler = torch.amp.GradScaler(
+ self.device,
+ enabled=self.optim_conf.amp.enabled if self.optim_conf else False,
+ )
+
+ self.gradient_clipper = (
+ instantiate(self.optim_conf.gradient_clip) if self.optim_conf else None
+ )
+ self.gradient_logger = (
+ instantiate(self.optim_conf.gradient_logger) if self.optim_conf else None
+ )
+
+ logging.info("Finished setting up components: Model, loss, optim, meters etc.")
+
+ def _construct_optimizers(self):
+ self.optim = construct_optimizer(
+ self.model,
+ self.optim_conf.optimizer,
+ self.optim_conf.options,
+ self.optim_conf.param_group_modifiers,
+ )
+
+ def _log_loss_detailed_and_return_core_loss(self, loss, loss_str, step):
+ core_loss = loss.pop(CORE_LOSS_KEY)
+ if step % self.logging_conf.log_scalar_frequency == 0:
+ for k in loss:
+ log_str = os.path.join(loss_str, k)
+ self.logger.log(log_str, loss[k], step)
+ return core_loss
+
+
+def print_model_summary(model: torch.nn.Module, log_dir: str = ""):
+ """
+ Prints the model and the number of parameters in the model.
+
+    Multiple packages provide this info in a nice table format, e.g.
+    https://github.com/sksq96/pytorch-summary and
+    https://github.com/nmhkahn/torchsummaryX. However, they need us to
+    provide an `input` (as they also write down the output sizes), and our
+    models are complex, so a single input is restrictive.
+ """
+ if get_rank() != 0:
+ return
+ param_kwargs = {}
+ trainable_parameters = sum(
+ p.numel() for p in model.parameters(**param_kwargs) if p.requires_grad
+ )
+ total_parameters = sum(p.numel() for p in model.parameters(**param_kwargs))
+ non_trainable_parameters = total_parameters - trainable_parameters
+ logging.info("==" * 10)
+ logging.info(f"Summary for model {type(model)}")
+ logging.info(f"Model is {model}")
+ logging.info(f"\tTotal parameters {get_human_readable_count(total_parameters)}")
+ logging.info(
+ f"\tTrainable parameters {get_human_readable_count(trainable_parameters)}"
+ )
+ logging.info(
+ f"\tNon-Trainable parameters {get_human_readable_count(non_trainable_parameters)}"
+ )
+ logging.info("==" * 10)
+
+ if log_dir:
+ output_fpath = os.path.join(log_dir, "model.txt")
+ with g_pathmgr.open(output_fpath, "w") as f:
+ print(model, file=f)
+
+
+PARAMETER_NUM_UNITS = [" ", "K", "M", "B", "T"]
+
+
+def get_human_readable_count(number: int) -> str:
+ """
+ Abbreviates an integer number with K, M, B, T for thousands, millions,
+ billions and trillions, respectively.
+ Examples:
+ >>> get_human_readable_count(123)
+ '123 '
+ >>> get_human_readable_count(1234) # (one thousand)
+ '1.2 K'
+ >>> get_human_readable_count(2e6) # (two million)
+ '2.0 M'
+ >>> get_human_readable_count(3e9) # (three billion)
+ '3.0 B'
+ >>> get_human_readable_count(4e14) # (four hundred trillion)
+ '400 T'
+ >>> get_human_readable_count(5e15) # (more than trillion)
+ '5,000 T'
+ Args:
+ number: a positive integer number
+ Return:
+ A string formatted according to the pattern described above.
+ """
+ assert number >= 0
+ labels = PARAMETER_NUM_UNITS
+ num_digits = int(np.floor(np.log10(number)) + 1 if number > 0 else 1)
+ num_groups = int(np.ceil(num_digits / 3))
+ num_groups = min(num_groups, len(labels)) # don't abbreviate beyond trillions
+ shift = -3 * (num_groups - 1)
+ number = number * (10**shift)
+ index = num_groups - 1
+ if index < 1 or number >= 100:
+ return f"{int(number):,d} {labels[index]}"
+ else:
+ return f"{number:,.1f} {labels[index]}"
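
A minimal sketch of the parameter accounting that `print_model_summary` performs, on a toy, partially frozen model (the module and numbers are purely illustrative):

```python
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 1))
for p in model[0].parameters():   # freeze the first layer
    p.requires_grad = False

total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(total, trainable, total - trainable)  # 241 21 220
# get_human_readable_count would render, e.g., 1_234_000 as '1.2 M'.
```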
diff --git a/phantom/submodules/sam2/training/utils/__init__.py b/phantom/submodules/sam2/training/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/phantom/submodules/sam2/training/utils/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/phantom/submodules/sam2/training/utils/checkpoint_utils.py b/phantom/submodules/sam2/training/utils/checkpoint_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f76689f341dedc485c0c32d096fb5b2e8337bea9
--- /dev/null
+++ b/phantom/submodules/sam2/training/utils/checkpoint_utils.py
@@ -0,0 +1,361 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import fnmatch
+import logging
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ List,
+ Mapping,
+ Optional,
+ Sequence,
+ Set,
+ Tuple,
+ Union,
+)
+
+import numpy as np
+import torch
+import torch.nn as nn
+from iopath.common.file_io import g_pathmgr
+from torch.jit._script import RecursiveScriptModule
+
+
+def unix_pattern_to_parameter_names(
+ constraints: List[str], all_parameter_names: Sequence[str]
+) -> Union[None, Set[str]]:
+ """
+ Go through the list of parameter names and select those that match
+ any of the provided constraints
+ """
+ parameter_names = []
+ for param_name in constraints:
+ matching_parameters = set(fnmatch.filter(all_parameter_names, param_name))
+ assert (
+ len(matching_parameters) > 0
+ ), f"param_names {param_name} don't match any param in the given names."
+ parameter_names.append(matching_parameters)
+ return set.union(*parameter_names)
+
+
+def filter_params_matching_unix_pattern(
+ patterns: List[str], state_dict: Dict[str, torch.Tensor]
+) -> Dict[str, torch.Tensor]:
+ """
+    Keep only the parameters of the state dictionary that match the provided unix patterns
+
+    Args:
+        patterns: the list of unix patterns to keep
+ state_dict: the dictionary to filter
+
+ Returns:
+ A new state dictionary
+ """
+ if len(patterns) == 0:
+ return {}
+
+ all_keys = list(state_dict.keys())
+ included_keys = unix_pattern_to_parameter_names(patterns, all_keys)
+ return {k: state_dict[k] for k in included_keys}
+
+
+def exclude_params_matching_unix_pattern(
+ patterns: List[str], state_dict: Dict[str, torch.Tensor]
+) -> Dict[str, torch.Tensor]:
+ """
+ Remove from the state dictionary the parameters matching the provided unix patterns
+
+ Args:
+ patterns: the list of unix patterns to exclude
+ state_dict: the dictionary to filter
+
+ Returns:
+ A new state dictionary
+ """
+ if len(patterns) == 0:
+ return state_dict
+
+ all_keys = list(state_dict.keys())
+ excluded_keys = unix_pattern_to_parameter_names(patterns, all_keys)
+ return {k: v for k, v in state_dict.items() if k not in excluded_keys}
+
+
+def _get_state_dict_summary(state_dict: Dict[str, torch.Tensor]):
+ keys = []
+ trace = []
+ for k, v in state_dict.items():
+ keys.append(k)
+ trace.append(v.sum().item())
+ trace = np.array(trace)[np.argsort(keys)]
+ return trace
+
+
+def assert_skipped_parameters_are_frozen(model: nn.Module, patterns: List[str]):
+ """
+ Verifies that all the parameters matching the provided patterns
+    are frozen. This acts as a safeguard when ignoring parameters
+    while saving checkpoints: it raises if those parameters are in fact trainable.
+ """
+ if not patterns:
+ return
+
+ frozen_state_dict = filter_params_matching_unix_pattern(
+ patterns=patterns, state_dict=model.state_dict()
+ )
+ non_frozen_keys = {
+ n
+ for n, p in model.named_parameters()
+ if n in frozen_state_dict and p.requires_grad
+ }
+ if non_frozen_keys:
+ raise ValueError(
+ f"Parameters excluded with `skip_saving_parameters` should be frozen: {non_frozen_keys}"
+ )
+
+
+@contextlib.contextmanager
+def with_check_parameter_frozen(
+ model: nn.Module, patterns: List[str], disabled: bool = True
+):
+ """
+    Context manager that inspects a model around a piece of code
+    and verifies whether the model has been updated by that piece of code.
+
+    The function will raise an exception if at least one of the parameters
+    matching one of the patterns has been updated.
+
+    Args:
+        model: the model that might have been updated
+        patterns: unix patterns selecting the parameters we want to observe
+        disabled: if True, the check is skipped entirely
+ """
+ if not patterns or disabled:
+ yield
+ return
+
+ frozen_state_dict = filter_params_matching_unix_pattern(
+ patterns=patterns, state_dict=model.state_dict()
+ )
+ summary_before = _get_state_dict_summary(frozen_state_dict)
+
+ yield
+
+ frozen_state_dict = filter_params_matching_unix_pattern(
+ patterns=patterns, state_dict=model.state_dict()
+ )
+ summary_after = _get_state_dict_summary(frozen_state_dict)
+
+ if not np.allclose(summary_before, summary_after, atol=1e-6):
+ raise ValueError(
+ f"""
+ The `model_weight_initializer` has initialized parameters frozen with `skip_saving_parameters`.
+ You can resolve this error by either initializing those parameters from within the model definition
+            or by setting the flag `trainer.checkpoint.initialize_after_preemption` to True.
+ """
+ )
+
+
+class CkptExcludeKernel:
+ """
+ Removes the keys from the given model state_dict that match the key_pattern.
+
+ Args:
+ key_pattern: Patterns used to select the keys in the state_dict
+ that are eligible for this kernel.
+ """
+
+ def __init__(self, key_pattern: List[str]):
+ self.key_pattern = key_pattern
+
+ def __call__(self, state_dict: Dict):
+ """
+ Args:
+ state_dict: A dictionary representing the given checkpoint's state dict.
+ """
+ if len(self.key_pattern) == 0:
+ return state_dict
+ exclude_keys = unix_pattern_to_parameter_names(
+ self.key_pattern, state_dict.keys()
+ )
+ return {k: v for k, v in state_dict.items() if k not in exclude_keys}
+
+
+def load_checkpoint(
+ path_list: List[str],
+ pick_recursive_keys: Optional[List[str]] = None,
+ map_location: str = "cpu",
+) -> Any:
+ """
+ Loads a checkpoint from the specified path.
+
+ Args:
+ path_list: A list of paths which contain the checkpoint. Each element
+ is tried (in order) until a file that exists is found. That file is then
+ used to read the checkpoint.
+ pick_recursive_keys: Picks sub dicts from the loaded checkpoint if not None.
+ For pick_recursive_keys = ["a", "b"], will return checkpoint_dict["a"]["b"]
+ map_location (str): a function, torch.device, string or a dict specifying how to
+ remap storage locations
+
+    Returns: The loaded checkpoint (or the sub-dictionary selected by pick_recursive_keys).
+ """
+ path_exists = False
+ for path in path_list:
+ if g_pathmgr.exists(path):
+ path_exists = True
+ break
+
+ if not path_exists:
+ raise ValueError(f"No path exists in {path_list}")
+
+ with g_pathmgr.open(path, "rb") as f:
+ checkpoint = torch.load(f, map_location=map_location)
+
+ logging.info(f"Loaded checkpoint from {path}")
+ if pick_recursive_keys is not None:
+ for key in pick_recursive_keys:
+ checkpoint = checkpoint[key]
+ return checkpoint
+
+
+def get_state_dict(checkpoint, ckpt_state_dict_keys):
+ if isinstance(checkpoint, RecursiveScriptModule):
+ # This is a torchscript JIT model
+ return checkpoint.state_dict()
+ pre_train_dict = checkpoint
+ for i, key in enumerate(ckpt_state_dict_keys):
+ if (isinstance(pre_train_dict, Mapping) and key not in pre_train_dict) or (
+ isinstance(pre_train_dict, Sequence) and key >= len(pre_train_dict)
+ ):
+ key_str = (
+                '["' + '"]["'.join(map(str, ckpt_state_dict_keys[:i])) + '"]'
+ )
+ raise KeyError(
+ f"'{key}' not found in checkpoint{key_str} "
+ f"with keys: {pre_train_dict.keys()}"
+ )
+ pre_train_dict = pre_train_dict[key]
+ return pre_train_dict
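
A tiny sketch of the nested-key traversal that `get_state_dict` performs, on a toy checkpoint dictionary (the structure is illustrative):

```python
checkpoint = {"state_dict": {"backbone.weight": 1.0}, "epoch": 7}

state = checkpoint
for key in ("state_dict",):   # plays the role of ckpt_state_dict_keys
    state = state[key]
print(state)  # {'backbone.weight': 1.0}
```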
+
+
+def load_checkpoint_and_apply_kernels(
+ checkpoint_path: str,
+ checkpoint_kernels: List[Callable] = None,
+ ckpt_state_dict_keys: Tuple[str] = ("state_dict",),
+ map_location: str = "cpu",
+) -> nn.Module:
+ """
+ Performs checkpoint loading with a variety of pre-processing kernel applied in
+ sequence.
+
+ Args:
+ checkpoint_path (str): Path to the checkpoint.
+ checkpoint_kernels List(Callable): A list of checkpoint processing kernels
+ to apply in the specified order. Supported kernels include `CkptIncludeKernel`,
+ `CkptExcludeKernel`, etc. These kernels are applied in the
+ given order.
+ ckpt_state_dict_keys (str): Keys containing the model state dict.
+ map_location (str): a function, torch.device, string or a dict specifying how to
+ remap storage locations
+
+    Returns: The checkpoint state dict with all the kernels applied.
+ """
+ assert g_pathmgr.exists(checkpoint_path), "Checkpoint '{}' not found".format(
+ checkpoint_path
+ )
+
+ # Load the checkpoint on CPU to avoid GPU mem spike.
+ with g_pathmgr.open(checkpoint_path, "rb") as f:
+ checkpoint = torch.load(f, map_location=map_location)
+
+ pre_train_dict = get_state_dict(checkpoint, ckpt_state_dict_keys)
+
+ # Not logging into info etc since it's a huge log
+ logging.debug(
+ "Loaded Checkpoint State Dict pre-kernel application: %s"
+ % str(", ".join(list(pre_train_dict.keys())))
+ )
+ # Apply kernels
+ if checkpoint_kernels is not None:
+ for f in checkpoint_kernels:
+ pre_train_dict = f(state_dict=pre_train_dict)
+
+ logging.debug(
+ "Loaded Checkpoint State Dict Post-kernel application %s"
+ % str(", ".join(list(pre_train_dict.keys())))
+ )
+
+ return pre_train_dict
+
+
+def check_load_state_dict_errors(
+ missing_keys,
+ unexpected_keys,
+ strict: bool,
+ ignore_missing_keys: List[str] = None,
+ ignore_unexpected_keys: List[str] = None,
+):
+ if ignore_missing_keys is not None and len(ignore_missing_keys) > 0:
+ ignored_keys = unix_pattern_to_parameter_names(
+ ignore_missing_keys, missing_keys
+ )
+ missing_keys = [key for key in missing_keys if key not in ignored_keys]
+
+ if ignore_unexpected_keys is not None and len(ignore_unexpected_keys) > 0:
+ ignored_unexpected_keys = unix_pattern_to_parameter_names(
+ ignore_unexpected_keys, unexpected_keys
+ )
+ unexpected_keys = [
+ key for key in unexpected_keys if key not in ignored_unexpected_keys
+ ]
+
+ err = "State key mismatch."
+ if unexpected_keys:
+ err += f" Unexpected keys: {unexpected_keys}."
+ if missing_keys:
+ err += f" Missing keys: {missing_keys}."
+
+ if unexpected_keys or missing_keys:
+ logging.warning(err)
+ if unexpected_keys or strict:
+ raise KeyError(err)
+
+
+def load_state_dict_into_model(
+ state_dict: Dict,
+ model: nn.Module,
+ strict: bool = True,
+ ignore_missing_keys: List[str] = None,
+ ignore_unexpected_keys: List[str] = None,
+ checkpoint_kernels: List[Callable] = None,
+):
+ """
+ Loads a state dict into the given model.
+
+ Args:
+ state_dict: A dictionary containing the model's
+ state dict, or a subset if strict is False
+ model: Model to load the checkpoint weights into
+ strict: raise if the state_dict has missing state keys
+ ignore_missing_keys: unix pattern of keys to ignore
+ """
+ # Apply kernels
+ if checkpoint_kernels is not None:
+ for f in checkpoint_kernels:
+ state_dict = f(state_dict=state_dict)
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+
+ check_load_state_dict_errors(
+ missing_keys,
+ unexpected_keys,
+ strict=strict,
+ ignore_missing_keys=ignore_missing_keys,
+ ignore_unexpected_keys=ignore_unexpected_keys,
+ )
+ return model
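
The include/exclude helpers above boil down to `fnmatch` filtering over state-dict keys. A minimal sketch on a toy state dict (keys and patterns are made up):

```python
import fnmatch

state_dict = {
    "backbone.layer1.weight": 0,
    "backbone.layer2.weight": 1,
    "head.weight": 2,
}
patterns = ["backbone.*"]

keep = set()
for pat in patterns:
    keep |= set(fnmatch.filter(state_dict.keys(), pat))

included = {k: v for k, v in state_dict.items() if k in keep}      # filter_params_matching_unix_pattern
excluded = {k: v for k, v in state_dict.items() if k not in keep}  # exclude_params_matching_unix_pattern
print(sorted(included))  # ['backbone.layer1.weight', 'backbone.layer2.weight']
print(sorted(excluded))  # ['head.weight']
```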
diff --git a/phantom/submodules/sam2/training/utils/data_utils.py b/phantom/submodules/sam2/training/utils/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbd0115355c97a27c601a833985466e558063b91
--- /dev/null
+++ b/phantom/submodules/sam2/training/utils/data_utils.py
@@ -0,0 +1,179 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Misc functions, including distributed helpers.
+
+Mostly copy-paste from torchvision references.
+"""
+
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from PIL import Image as PILImage
+from tensordict import tensorclass
+
+
+@tensorclass
+class BatchedVideoMetaData:
+ """
+ This class represents metadata about a batch of videos.
+ Attributes:
+ unique_objects_identifier: A tensor of shape Bx3 containing unique identifiers for each object in the batch. Index consists of (video_id, obj_id, frame_id)
+ frame_orig_size: A tensor of shape Bx2 containing the original size of each frame in the batch.
+ """
+
+ unique_objects_identifier: torch.LongTensor
+ frame_orig_size: torch.LongTensor
+
+
+@tensorclass
+class BatchedVideoDatapoint:
+ """
+ This class represents a batch of videos with associated annotations and metadata.
+ Attributes:
+ img_batch: A [TxBxCxHxW] tensor containing the image data for each frame in the batch, where T is the number of frames per video, and B is the number of videos in the batch.
+        obj_to_frame_idx: A [TxOx2] tensor containing the [frame_idx, video_idx] pair that each object in img_batch belongs to, where O is the number of objects in the batch.
+ masks: A [TxOxHxW] tensor containing binary masks for each object in the batch.
+ metadata: An instance of BatchedVideoMetaData containing metadata about the batch.
+ dict_key: A string key used to identify the batch.
+ """
+
+ img_batch: torch.FloatTensor
+ obj_to_frame_idx: torch.IntTensor
+ masks: torch.BoolTensor
+ metadata: BatchedVideoMetaData
+
+ dict_key: str
+
+ def pin_memory(self, device=None):
+ return self.apply(torch.Tensor.pin_memory, device=device)
+
+ @property
+ def num_frames(self) -> int:
+ """
+ Returns the number of frames per video.
+ """
+ return self.batch_size[0]
+
+ @property
+ def num_videos(self) -> int:
+ """
+ Returns the number of videos in the batch.
+ """
+ return self.img_batch.shape[1]
+
+ @property
+ def flat_obj_to_img_idx(self) -> torch.IntTensor:
+ """
+ Returns a flattened tensor containing the object to img index.
+        The flat index can be used to access the flattened img_batch of shape [(B*T)xCxHxW]
+ """
+ frame_idx, video_idx = self.obj_to_frame_idx.unbind(dim=-1)
+ flat_idx = video_idx * self.num_frames + frame_idx
+ return flat_idx
+
+ @property
+ def flat_img_batch(self) -> torch.FloatTensor:
+ """
+ Returns a flattened img_batch_tensor of shape [(B*T)xCxHxW]
+ """
+
+ return self.img_batch.transpose(0, 1).flatten(0, 1)
+
+
+@dataclass
+class Object:
+ # Id of the object in the media
+ object_id: int
+ # Index of the frame in the media (0 if single image)
+ frame_index: int
+ segment: Union[torch.Tensor, dict] # RLE dict or binary mask
+
+
+@dataclass
+class Frame:
+ data: Union[torch.Tensor, PILImage.Image]
+ objects: List[Object]
+
+
+@dataclass
+class VideoDatapoint:
+ """Refers to an image/video and all its annotations"""
+
+ frames: List[Frame]
+ video_id: int
+ size: Tuple[int, int]
+
+
+def collate_fn(
+ batch: List[VideoDatapoint],
+ dict_key,
+) -> BatchedVideoDatapoint:
+ """
+ Args:
+ batch: A list of VideoDatapoint instances.
+ dict_key (str): A string key used to identify the batch.
+ """
+ img_batch = []
+ for video in batch:
+ img_batch += [torch.stack([frame.data for frame in video.frames], dim=0)]
+
+ img_batch = torch.stack(img_batch, dim=0).permute((1, 0, 2, 3, 4))
+ T = img_batch.shape[0]
+ # Prepare data structures for sequential processing. Per-frame processing but batched across videos.
+ step_t_objects_identifier = [[] for _ in range(T)]
+ step_t_frame_orig_size = [[] for _ in range(T)]
+
+ step_t_masks = [[] for _ in range(T)]
+ step_t_obj_to_frame_idx = [
+ [] for _ in range(T)
+ ] # List to store frame indices for each time step
+
+ for video_idx, video in enumerate(batch):
+ orig_video_id = video.video_id
+ orig_frame_size = video.size
+ for t, frame in enumerate(video.frames):
+ objects = frame.objects
+ for obj in objects:
+ orig_obj_id = obj.object_id
+ orig_frame_idx = obj.frame_index
+ step_t_obj_to_frame_idx[t].append(
+ torch.tensor([t, video_idx], dtype=torch.int)
+ )
+ step_t_masks[t].append(obj.segment.to(torch.bool))
+ step_t_objects_identifier[t].append(
+ torch.tensor([orig_video_id, orig_obj_id, orig_frame_idx])
+ )
+ step_t_frame_orig_size[t].append(torch.tensor(orig_frame_size))
+
+ obj_to_frame_idx = torch.stack(
+ [
+ torch.stack(obj_to_frame_idx, dim=0)
+ for obj_to_frame_idx in step_t_obj_to_frame_idx
+ ],
+ dim=0,
+ )
+ masks = torch.stack([torch.stack(masks, dim=0) for masks in step_t_masks], dim=0)
+ objects_identifier = torch.stack(
+ [torch.stack(id, dim=0) for id in step_t_objects_identifier], dim=0
+ )
+ frame_orig_size = torch.stack(
+ [torch.stack(id, dim=0) for id in step_t_frame_orig_size], dim=0
+ )
+ return BatchedVideoDatapoint(
+ img_batch=img_batch,
+ obj_to_frame_idx=obj_to_frame_idx,
+ masks=masks,
+ metadata=BatchedVideoMetaData(
+ unique_objects_identifier=objects_identifier,
+ frame_orig_size=frame_orig_size,
+ ),
+ dict_key=dict_key,
+ batch_size=[T],
+ )
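
To make the video-major flattening used by `flat_img_batch` and `flat_obj_to_img_idx` concrete, here is a toy-sized sketch (shapes are illustrative):

```python
import torch

T, B, C, H, W = 3, 2, 1, 4, 4  # frames, videos, channels, height, width
img_batch = torch.arange(T * B).float().view(T, B, 1, 1, 1).expand(T, B, C, H, W)

# flat_img_batch: [T, B, ...] -> [B, T, ...] -> [(B*T), ...], i.e. video-major order.
flat = img_batch.transpose(0, 1).flatten(0, 1)

# An object seen in frame t of video b lives at flat index b * T + t.
t, b = 2, 1
assert torch.equal(flat[b * T + t], img_batch[t, b])
```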
diff --git a/phantom/submodules/sam2/training/utils/distributed.py b/phantom/submodules/sam2/training/utils/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..f614b40427f40350c4df9e695cd327cb4d6a96f6
--- /dev/null
+++ b/phantom/submodules/sam2/training/utils/distributed.py
@@ -0,0 +1,576 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import datetime
+import functools
+import io
+import logging
+import os
+import random
+import tempfile
+import time
+from typing import Any, Callable, List, Tuple
+
+import torch
+import torch.autograd as autograd
+import torch.distributed as dist
+
+
+# Default to GPU 0
+_cuda_device_index: int = 0
+
+# Setting _cuda_device_index to -1 internally implies that we should use CPU
+_CPU_DEVICE_INDEX = -1
+_PRIMARY_RANK = 0
+
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+ """
+ Return a process group based on gloo backend, containing all the ranks
+ The result is cached.
+ """
+
+ if dist.get_backend() == "nccl":
+ # Increase timeout from 1800 sec to 43200 sec (12 hr) to avoid some processes
+ # being much slower than others causing a timeout (which can happen in relation
+ # or LVIS class mAP evaluation).
+ timeout = 43200
+ return dist.new_group(
+ backend="gloo",
+ timeout=datetime.timedelta(seconds=timeout),
+ )
+
+ return dist.group.WORLD
+
+
+def is_main_process():
+ """Return true if the current process is the main one"""
+ return get_rank() == 0
+
+
+def all_gather_via_filesys(data, filesys_save_dir=None, gather_to_rank_0_only=False):
+ """
+ Run all_gather on arbitrary picklable data (not necessarily tensors), similar to
+ `all_gather` above, but using filesystem instead of collective ops.
+
+ If gather_to_rank_0_only is True, only rank 0 will load the gathered object list
+ (and other ranks will have an empty list).
+ """
+ world_size = get_world_size()
+ if world_size == 1:
+ return [data]
+
+ print("gathering via files")
+ cpu_group = _get_global_gloo_group()
+
+ # if unspecified, we will save to the current python file dir
+ if filesys_save_dir is not None:
+ save_dir = filesys_save_dir
+ elif "EXP_DIR" in os.environ:
+ save_dir = os.environ["EXP_DIR"]
+ else:
+ # try the same directory where the code is stored
+ save_dir = filesys_save_dir or os.path.dirname(__file__)
+ save_dir = os.path.join(save_dir, "all_gather_via_filesys")
+ if is_main_process():
+ os.makedirs(save_dir, exist_ok=True)
+
+ # use a timestamp and salt to distinguish different all_gather
+ timestamp = int(time.time()) if is_main_process() else 0
+ salt = random.randint(0, 2**31 - 1) if is_main_process() else 0
+ # broadcast the timestamp and salt across ranks
+ # (all-reduce will do the broadcasting since only rank 0 is non-zero)
+ timestamp_and_salt = torch.tensor([timestamp, salt], dtype=torch.long)
+ dist.all_reduce(timestamp_and_salt, group=cpu_group)
+ timestamp, salt = timestamp_and_salt.tolist()
+
+ # save the data to a file on the disk
+ rank_save = get_rank()
+ save_data_filename = f"data_to_gather_{timestamp}_{salt}_{rank_save}.pkl"
+ save_data_path = os.path.join(save_dir, save_data_filename)
+ assert not os.path.exists(save_data_path), f"{save_data_path} already exists"
+ torch.save(data, save_data_path)
+ dist.barrier(group=cpu_group)
+
+ # read the data from the files
+ data_list = []
+ if rank_save == 0 or not gather_to_rank_0_only:
+ for rank_load in range(world_size):
+ load_data_filename = f"data_to_gather_{timestamp}_{salt}_{rank_load}.pkl"
+ load_data_path = os.path.join(save_dir, load_data_filename)
+            assert os.path.exists(load_data_path), f"cannot read {load_data_path}"
+ data_list.append(torch.load(load_data_path))
+ dist.barrier(group=cpu_group)
+
+ # delete the saved file
+ os.remove(save_data_path)
+ return data_list
+
+
+def all_gather(data, force_cpu=False, force_filesys=False, filesys_save_dir=None):
+ """
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
+ Args:
+ data: any picklable object
+ Returns:
+ list[data]: list of data gathered from each rank
+ """
+
+ world_size = get_world_size()
+ if world_size == 1:
+ return [data]
+
+ if os.getenv("MDETR_FILESYS_REDUCE_RANK_0_ONLY") == "1":
+ return all_gather_via_filesys(
+ data, filesys_save_dir, gather_to_rank_0_only=True
+ )
+
+ if os.getenv("MDETR_FILESYS_REDUCE") == "1" or force_filesys:
+ return all_gather_via_filesys(data, filesys_save_dir)
+
+ cpu_group = None
+ if os.getenv("MDETR_CPU_REDUCE") == "1" or force_cpu:
+ cpu_group = _get_global_gloo_group()
+
+ buffer = io.BytesIO()
+ torch.save(data, buffer)
+ data_view = buffer.getbuffer()
+ device = "cuda" if cpu_group is None else "cpu"
+ tensor = torch.ByteTensor(data_view).to(device)
+
+ # obtain Tensor size of each rank
+ local_size = torch.tensor([tensor.numel()], device=device, dtype=torch.long)
+ size_list = [
+ torch.tensor([0], device=device, dtype=torch.long) for _ in range(world_size)
+ ]
+ if cpu_group is None:
+ dist.all_gather(size_list, local_size)
+ else:
+ print("gathering on cpu")
+ dist.all_gather(size_list, local_size, group=cpu_group)
+ size_list = [int(size.item()) for size in size_list]
+ max_size = max(size_list)
+ assert isinstance(local_size.item(), int)
+ local_size = int(local_size.item())
+
+ # receiving Tensor from all ranks
+ # we pad the tensor because torch all_gather does not support
+ # gathering tensors of different shapes
+ tensor_list = []
+ for _ in size_list:
+ tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device=device))
+ if local_size != max_size:
+ padding = torch.empty(
+ size=(max_size - local_size,), dtype=torch.uint8, device=device
+ )
+ tensor = torch.cat((tensor, padding), dim=0)
+ if cpu_group is None:
+ dist.all_gather(tensor_list, tensor)
+ else:
+ dist.all_gather(tensor_list, tensor, group=cpu_group)
+
+ data_list = []
+ for size, tensor in zip(size_list, tensor_list):
+ tensor = torch.split(tensor, [size, max_size - size], dim=0)[0]
+ buffer = io.BytesIO(tensor.cpu().numpy())
+ obj = torch.load(buffer)
+ data_list.append(obj)
+
+ return data_list
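
The core trick in `all_gather` above is serializing an arbitrary object into a uint8 tensor, padding it to a common size, and slicing it back on the receiving side. A single-process sketch of that round trip (payload and padding size are made up):

```python
import io
import torch

payload = {"rank": 0, "metrics": [0.1, 0.2]}

buf = io.BytesIO()
torch.save(payload, buf)
tensor = torch.ByteTensor(buf.getbuffer())           # object -> uint8 tensor

# all_gather needs same-shaped tensors, so pad to a common max size ...
max_size = tensor.numel() + 16
padding = torch.zeros(max_size - tensor.numel(), dtype=torch.uint8)
padded = torch.cat([tensor, padding])

# ... and slice back to the true length before deserializing.
restored = torch.load(io.BytesIO(padded[: tensor.numel()].numpy().tobytes()))
print(restored)  # {'rank': 0, 'metrics': [0.1, 0.2]}
```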
+
+
+def convert_to_distributed_tensor(tensor: torch.Tensor) -> Tuple[torch.Tensor, str]:
+ """
+ For some backends, such as NCCL, communication only works if the
+ tensor is on the GPU. This helper function converts to the correct
+ device and returns the tensor + original device.
+ """
+ orig_device = "cpu" if not tensor.is_cuda else "gpu"
+ if (
+ torch.distributed.is_available()
+ and torch.distributed.get_backend() == torch.distributed.Backend.NCCL
+ and not tensor.is_cuda
+ ):
+ tensor = tensor.cuda()
+ return (tensor, orig_device)
+
+
+def convert_to_normal_tensor(tensor: torch.Tensor, orig_device: str) -> torch.Tensor:
+ """
+ For some backends, such as NCCL, communication only works if the
+ tensor is on the GPU. This converts the tensor back to original device.
+ """
+ if tensor.is_cuda and orig_device == "cpu":
+ tensor = tensor.cpu()
+ return tensor
+
+
+def is_distributed_training_run() -> bool:
+ return (
+ torch.distributed.is_available()
+ and torch.distributed.is_initialized()
+ and (torch.distributed.get_world_size() > 1)
+ )
+
+
+def is_primary() -> bool:
+ """
+ Returns True if this is rank 0 of a distributed training job OR if it is
+ a single trainer job. Otherwise False.
+ """
+ return get_rank() == _PRIMARY_RANK
+
+
+def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
+ """
+ Wrapper over torch.distributed.all_reduce for performing mean reduction
+ of tensor over all processes.
+ """
+ return all_reduce_op(
+ tensor,
+ torch.distributed.ReduceOp.SUM,
+ lambda t: t / torch.distributed.get_world_size(),
+ )
+
+
+def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor:
+ """
+ Wrapper over torch.distributed.all_reduce for performing sum
+ reduction of tensor over all processes in both distributed /
+ non-distributed scenarios.
+ """
+ return all_reduce_op(tensor, torch.distributed.ReduceOp.SUM)
+
+
+def all_reduce_min(tensor: torch.Tensor) -> torch.Tensor:
+ """
+ Wrapper over torch.distributed.all_reduce for performing min
+ reduction of tensor over all processes in both distributed /
+ non-distributed scenarios.
+ """
+ return all_reduce_op(tensor, torch.distributed.ReduceOp.MIN)
+
+
+def all_reduce_max(tensor: torch.Tensor) -> torch.Tensor:
+ """
+    Wrapper over torch.distributed.all_reduce for performing max
+ reduction of tensor over all processes in both distributed /
+ non-distributed scenarios.
+ """
+ return all_reduce_op(tensor, torch.distributed.ReduceOp.MAX)
+
+
+def all_reduce_op(
+ tensor: torch.Tensor,
+ op: torch.distributed.ReduceOp,
+ after_op_func: Callable[[torch.Tensor], torch.Tensor] = None,
+) -> torch.Tensor:
+ """
+ Wrapper over torch.distributed.all_reduce for performing
+ reduction of tensor over all processes in both distributed /
+ non-distributed scenarios.
+ """
+ if is_distributed_training_run():
+ tensor, orig_device = convert_to_distributed_tensor(tensor)
+ torch.distributed.all_reduce(tensor, op)
+ if after_op_func is not None:
+ tensor = after_op_func(tensor)
+ tensor = convert_to_normal_tensor(tensor, orig_device)
+ return tensor
+
+
+def gather_tensors_from_all(tensor: torch.Tensor) -> List[torch.Tensor]:
+ """
+ Wrapper over torch.distributed.all_gather for performing
+ 'gather' of 'tensor' over all processes in both distributed /
+ non-distributed scenarios.
+ """
+ if tensor.ndim == 0:
+ # 0 dim tensors cannot be gathered. so unsqueeze
+ tensor = tensor.unsqueeze(0)
+
+ if is_distributed_training_run():
+ tensor, orig_device = convert_to_distributed_tensor(tensor)
+ gathered_tensors = [
+ torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())
+ ]
+ torch.distributed.all_gather(gathered_tensors, tensor)
+ gathered_tensors = [
+ convert_to_normal_tensor(_tensor, orig_device)
+ for _tensor in gathered_tensors
+ ]
+ else:
+ gathered_tensors = [tensor]
+
+ return gathered_tensors
+
+
+def gather_from_all(tensor: torch.Tensor) -> torch.Tensor:
+ gathered_tensors = gather_tensors_from_all(tensor)
+ gathered_tensor = torch.cat(gathered_tensors, 0)
+ return gathered_tensor
+
+
+def broadcast(tensor: torch.Tensor, src: int = 0) -> torch.Tensor:
+ """
+ Wrapper over torch.distributed.broadcast for broadcasting a tensor from the source
+ to all processes in both distributed / non-distributed scenarios.
+ """
+ if is_distributed_training_run():
+ tensor, orig_device = convert_to_distributed_tensor(tensor)
+ torch.distributed.broadcast(tensor, src)
+ tensor = convert_to_normal_tensor(tensor, orig_device)
+ return tensor
+
+
+def barrier() -> None:
+ """
+ Wrapper over torch.distributed.barrier, returns without waiting
+ if the distributed process group is not initialized instead of throwing error.
+ """
+ if not torch.distributed.is_available() or not torch.distributed.is_initialized():
+ return
+ torch.distributed.barrier()
+
+
+def get_world_size() -> int:
+ """
+    Simple wrapper for correctly getting the world size in both distributed
+ / non-distributed settings
+ """
+ return (
+ torch.distributed.get_world_size()
+ if torch.distributed.is_available() and torch.distributed.is_initialized()
+ else 1
+ )
+
+
+def get_rank() -> int:
+ """
+ Simple wrapper for correctly getting rank in both distributed
+ / non-distributed settings
+ """
+ return (
+ torch.distributed.get_rank()
+ if torch.distributed.is_available() and torch.distributed.is_initialized()
+ else 0
+ )
+
+
+def get_primary_rank() -> int:
+ return _PRIMARY_RANK
+
+
+def set_cuda_device_index(idx: int) -> None:
+ global _cuda_device_index
+ _cuda_device_index = idx
+ torch.cuda.set_device(_cuda_device_index)
+
+
+def set_cpu_device() -> None:
+ global _cuda_device_index
+ _cuda_device_index = _CPU_DEVICE_INDEX
+
+
+def get_cuda_device_index() -> int:
+ return _cuda_device_index
+
+
+def init_distributed_data_parallel_model(
+ model: torch.nn.Module,
+ broadcast_buffers: bool = False,
+ find_unused_parameters: bool = True,
+ bucket_cap_mb: int = 25,
+) -> torch.nn.parallel.DistributedDataParallel:
+ global _cuda_device_index
+
+ if _cuda_device_index == _CPU_DEVICE_INDEX:
+ # CPU-only model, don't specify device
+ return torch.nn.parallel.DistributedDataParallel(
+ model,
+ broadcast_buffers=broadcast_buffers,
+ find_unused_parameters=find_unused_parameters,
+ bucket_cap_mb=bucket_cap_mb,
+ )
+ else:
+ # GPU model
+ return torch.nn.parallel.DistributedDataParallel(
+ model,
+ device_ids=[_cuda_device_index],
+ output_device=_cuda_device_index,
+ broadcast_buffers=broadcast_buffers,
+ find_unused_parameters=find_unused_parameters,
+ bucket_cap_mb=bucket_cap_mb,
+ )
+
+
+def broadcast_object(obj: Any, src: int = _PRIMARY_RANK, use_disk: bool = True) -> Any:
+ """Broadcast an object from a source to all workers.
+
+ Args:
+ obj: Object to broadcast, must be serializable
+ src: Source rank for broadcast (default is primary)
+ use_disk: If enabled, removes redundant CPU memory copies by writing to
+ disk
+ """
+ # Either broadcast from primary to the fleet (default),
+ # or use the src setting as the original rank
+ if get_rank() == src:
+ # Emit data
+ buffer = io.BytesIO()
+ torch.save(obj, buffer)
+ data_view = buffer.getbuffer()
+ length_tensor = torch.LongTensor([len(data_view)])
+ length_tensor = broadcast(length_tensor, src=src)
+ data_tensor = torch.ByteTensor(data_view)
+ data_tensor = broadcast(data_tensor, src=src)
+ else:
+ # Fetch from the source
+ length_tensor = torch.LongTensor([0])
+ length_tensor = broadcast(length_tensor, src=src)
+ data_tensor = torch.empty([length_tensor.item()], dtype=torch.uint8)
+ data_tensor = broadcast(data_tensor, src=src)
+ if use_disk:
+ with tempfile.TemporaryFile("r+b") as f:
+ f.write(data_tensor.numpy())
+ # remove reference to the data tensor and hope that Python garbage
+ # collects it
+ del data_tensor
+ f.seek(0)
+ obj = torch.load(f)
+ else:
+ buffer = io.BytesIO(data_tensor.numpy())
+ obj = torch.load(buffer)
+ return obj
+
+
+def all_gather_tensor(tensor: torch.Tensor, world_size=None):
+ if world_size is None:
+ world_size = get_world_size()
+ # make contiguous because NCCL won't gather the tensor otherwise
+ assert tensor.is_contiguous(), f"{tensor.shape} is not contiguous!"
+ tensor, orig_device = convert_to_distributed_tensor(tensor)
+ tensor_all = [torch.ones_like(tensor) for _ in range(world_size)]
+ dist.all_gather(tensor_all, tensor, async_op=False) # performance opt
+ tensor_all = [
+ convert_to_normal_tensor(tensor, orig_device) for tensor in tensor_all
+ ]
+ return tensor_all
+
+
+def all_gather_batch(tensors: List[torch.Tensor]):
+ """
+ Performs all_gather operation on the provided tensors.
+ """
+ # Queue the gathered tensors
+ world_size = get_world_size()
+ # There is no need for reduction in the single-proc case
+ if world_size == 1:
+ return tensors
+ tensor_list = []
+ output_tensor = []
+ for tensor in tensors:
+ tensor_all = all_gather_tensor(tensor, world_size)
+ tensor_list.append(tensor_all)
+
+ for tensor_all in tensor_list:
+ output_tensor.append(torch.cat(tensor_all, dim=0))
+ return output_tensor
+
+
+class GatherLayer(autograd.Function):
+ """
+ Gather tensors from all workers with support for backward propagation:
+ This implementation does not cut the gradients as torch.distributed.all_gather does.
+ """
+
+ @staticmethod
+ def forward(ctx, x):
+ output = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
+ dist.all_gather(output, x)
+ return tuple(output)
+
+ @staticmethod
+ def backward(ctx, *grads):
+ all_gradients = torch.stack(grads)
+ dist.all_reduce(all_gradients)
+ return all_gradients[dist.get_rank()]
+
+
+def all_gather_batch_with_grad(tensors):
+ """
+ Performs all_gather operation on the provided tensors.
+ Graph remains connected for backward grad computation.
+ """
+ # Queue the gathered tensors
+ world_size = get_world_size()
+ # There is no need for reduction in the single-proc case
+ if world_size == 1:
+ return tensors
+ tensor_list = []
+ output_tensor = []
+
+ for tensor in tensors:
+ tensor_all = GatherLayer.apply(tensor)
+ tensor_list.append(tensor_all)
+
+ for tensor_all in tensor_list:
+ output_tensor.append(torch.cat(tensor_all, dim=0))
+ return output_tensor
+
+
+def unwrap_ddp_if_wrapped(model):
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel):
+ return model.module
+ return model
+
+
+def create_new_process_group(group_size):
+ """
+    Creates process groups of a given `group_size` and returns the
+    process group that the current GPU participates in.
+
+ `group_size` must divide the total number of GPUs (world_size).
+
+ Modified from
+ https://github.com/NVIDIA/apex/blob/4e1ae43f7f7ac69113ef426dd15f37123f0a2ed3/apex/parallel/__init__.py#L60
+
+ Args:
+        group_size (int): number of GPUs to collaborate for sync bn
+ """
+
+ assert group_size > 0
+
+ world_size = torch.distributed.get_world_size()
+ if world_size <= 8:
+ if group_size > world_size:
+ logging.warning(
+ f"Requested group size [{group_size}] > world size [{world_size}]. "
+ "Assuming local debug run and capping it to world size."
+ )
+ group_size = world_size
+ assert world_size >= group_size
+ assert world_size % group_size == 0
+
+ group = None
+ for group_num in range(world_size // group_size):
+ group_ids = range(group_num * group_size, (group_num + 1) * group_size)
+ cur_group = torch.distributed.new_group(ranks=group_ids)
+ if torch.distributed.get_rank() // group_size == group_num:
+ group = cur_group
+ # can not drop out and return here, every process must go through creation of all subgroups
+
+ assert group is not None
+ return group
+
+
+def is_dist_avail_and_initialized():
+ if not dist.is_available():
+ return False
+ if not dist.is_initialized():
+ return False
+ return True
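
The rank-to-group assignment in `create_new_process_group` is plain integer arithmetic; with made-up sizes it partitions the ranks as follows:

```python
world_size, group_size = 8, 4

for group_num in range(world_size // group_size):
    group_ids = list(range(group_num * group_size, (group_num + 1) * group_size))
    print(group_num, group_ids)
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
# Rank 5 belongs to group 5 // 4 == 1, i.e. ranks [4, 5, 6, 7].
```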
diff --git a/phantom/submodules/sam2/training/utils/logger.py b/phantom/submodules/sam2/training/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4b4ef0ebe359063e1ca2c3a46cb8fcc76d067c2
--- /dev/null
+++ b/phantom/submodules/sam2/training/utils/logger.py
@@ -0,0 +1,246 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Code borrowed from TLC - https://www.internalfb.com/code/fbsource/fbcode/pytorch/tlc/torchtlc/loggers/tensorboard.py
+import atexit
+import functools
+import logging
+import sys
+import uuid
+from typing import Any, Dict, Optional, Union
+
+from hydra.utils import instantiate
+
+from iopath.common.file_io import g_pathmgr
+from numpy import ndarray
+from torch import Tensor
+from torch.utils.tensorboard import SummaryWriter
+
+from training.utils.train_utils import get_machine_local_and_dist_rank, makedir
+
+Scalar = Union[Tensor, ndarray, int, float]
+
+
+def make_tensorboard_logger(log_dir: str, **writer_kwargs: Any):
+ makedir(log_dir)
+ summary_writer_method = SummaryWriter
+ return TensorBoardLogger(
+ path=log_dir, summary_writer_method=summary_writer_method, **writer_kwargs
+ )
+
+
+class TensorBoardWriterWrapper:
+ """
+ A wrapper around a SummaryWriter object.
+ """
+
+ def __init__(
+ self,
+ path: str,
+ *args: Any,
+ filename_suffix: str = None,
+ summary_writer_method: Any = SummaryWriter,
+ **kwargs: Any,
+ ) -> None:
+ """Create a new TensorBoard logger.
+ On construction, the logger creates a new events file that logs
+ will be written to. If the environment variable `RANK` is defined,
+ logger will only log if RANK = 0.
+
+ NOTE: If using the logger with distributed training:
+ - This logger can call collective operations
+ - Logs will be written on rank 0 only
+ - Logger must be constructed synchronously *after* initializing distributed process group.
+
+ Args:
+ path (str): path to write logs to
+ *args, **kwargs: Extra arguments to pass to SummaryWriter
+ """
+ self._writer: Optional[SummaryWriter] = None
+ _, self._rank = get_machine_local_and_dist_rank()
+ self._path: str = path
+ if self._rank == 0:
+ logging.info(
+ f"TensorBoard SummaryWriter instantiated. Files will be stored in: {path}"
+ )
+ self._writer = summary_writer_method(
+ log_dir=path,
+ *args,
+ filename_suffix=filename_suffix or str(uuid.uuid4()),
+ **kwargs,
+ )
+ else:
+ logging.debug(
+ f"Not logging meters on this host because env RANK: {self._rank} != 0"
+ )
+ atexit.register(self.close)
+
+ @property
+ def writer(self) -> Optional[SummaryWriter]:
+ return self._writer
+
+ @property
+ def path(self) -> str:
+ return self._path
+
+ def flush(self) -> None:
+ """Writes pending logs to disk."""
+
+ if not self._writer:
+ return
+
+ self._writer.flush()
+
+ def close(self) -> None:
+ """Close writer, flushing pending logs to disk.
+ Logs cannot be written after `close` is called.
+ """
+
+ if not self._writer:
+ return
+
+ self._writer.close()
+ self._writer = None
+
+
+class TensorBoardLogger(TensorBoardWriterWrapper):
+ """
+ A simple logger for TensorBoard.
+ """
+
+ def log_dict(self, payload: Dict[str, Scalar], step: int) -> None:
+ """Add multiple scalar values to TensorBoard.
+
+ Args:
+ payload (dict): dictionary of tag name and scalar value
+ step (int, Optional): step value to record
+ """
+ if not self._writer:
+ return
+ for k, v in payload.items():
+ self.log(k, v, step)
+
+ def log(self, name: str, data: Scalar, step: int) -> None:
+ """Add scalar data to TensorBoard.
+
+ Args:
+ name (string): tag name used to group scalars
+ data (float/int/Tensor): scalar data to log
+ step (int, optional): step value to record
+ """
+ if not self._writer:
+ return
+ self._writer.add_scalar(name, data, global_step=step, new_style=True)
+
+ def log_hparams(
+ self, hparams: Dict[str, Scalar], meters: Dict[str, Scalar]
+ ) -> None:
+ """Add hyperparameter data to TensorBoard.
+
+ Args:
+ hparams (dict): dictionary of hyperparameter names and corresponding values
+            meters (dict): dictionary of meter names and corresponding values
+ """
+ if not self._writer:
+ return
+ self._writer.add_hparams(hparams, meters)
+
+
+class Logger:
+ """
+    A logger class that can interface with multiple loggers. It currently supports only TensorBoard for simplicity, but can be extended with additional loggers.
+ """
+
+ def __init__(self, logging_conf):
+ # allow turning off TensorBoard with "should_log: false" in config
+ tb_config = logging_conf.tensorboard_writer
+ tb_should_log = tb_config and tb_config.pop("should_log", True)
+ self.tb_logger = instantiate(tb_config) if tb_should_log else None
+
+ def log_dict(self, payload: Dict[str, Scalar], step: int) -> None:
+ if self.tb_logger:
+ self.tb_logger.log_dict(payload, step)
+
+ def log(self, name: str, data: Scalar, step: int) -> None:
+ if self.tb_logger:
+ self.tb_logger.log(name, data, step)
+
+ def log_hparams(
+ self, hparams: Dict[str, Scalar], meters: Dict[str, Scalar]
+ ) -> None:
+ if self.tb_logger:
+ self.tb_logger.log_hparams(hparams, meters)
+
+
+# cache the opened file object, so that different calls to `setup_logger`
+# with the same file name can safely write to the same file.
+@functools.lru_cache(maxsize=None)
+def _cached_log_stream(filename):
+ # we tune the buffering value so that the logs are updated
+ # frequently.
+    log_buffer_kb = 10 * 1024  # 10 KB buffer, passed to open() in bytes
+ io = g_pathmgr.open(filename, mode="a", buffering=log_buffer_kb)
+ atexit.register(io.close)
+ return io
+
+
+def setup_logging(
+ name,
+ output_dir=None,
+ rank=0,
+ log_level_primary="INFO",
+ log_level_secondary="ERROR",
+):
+ """
+ Setup various logging streams: stdout and file handlers.
+ For file handlers, we only setup for the master gpu.
+ """
+ # get the filename if we want to log to the file as well
+ log_filename = None
+ if output_dir:
+ makedir(output_dir)
+ if rank == 0:
+ log_filename = f"{output_dir}/log.txt"
+
+ logger = logging.getLogger(name)
+ logger.setLevel(log_level_primary)
+
+ # create formatter
+ FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)4d: %(message)s"
+ formatter = logging.Formatter(FORMAT)
+
+ # Cleanup any existing handlers
+ for h in logger.handlers:
+ logger.removeHandler(h)
+ logger.root.handlers = []
+
+ # setup the console handler
+ console_handler = logging.StreamHandler(sys.stdout)
+ console_handler.setFormatter(formatter)
+ logger.addHandler(console_handler)
+ if rank == 0:
+ console_handler.setLevel(log_level_primary)
+ else:
+ console_handler.setLevel(log_level_secondary)
+
+ # we log to file as well if user wants
+ if log_filename and rank == 0:
+ file_handler = logging.StreamHandler(_cached_log_stream(log_filename))
+ file_handler.setLevel(log_level_primary)
+ file_handler.setFormatter(formatter)
+ logger.addHandler(file_handler)
+
+ logging.root = logger
+
+
+def shutdown_logging():
+ """
+ After training is done, we ensure to shut down all the logger streams.
+ """
+ logging.info("Shutting down loggers...")
+ handlers = logging.root.handlers
+ for handler in handlers:
+ handler.close()
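
The `TensorBoardLogger` above is a thin wrapper over `torch.utils.tensorboard.SummaryWriter`. A minimal direct-usage sketch, assuming the tensorboard package is installed (the log directory, tags, and values are illustrative):

```python
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="/tmp/tb_demo")
for step, loss in enumerate([0.9, 0.7, 0.55]):
    writer.add_scalar("Losses/train_demo_loss", loss, global_step=step, new_style=True)
writer.add_hparams({"lr": 1e-3}, {"best_val_loss": 0.55})
writer.flush()
writer.close()
```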
diff --git a/phantom/submodules/sam2/training/utils/train_utils.py b/phantom/submodules/sam2/training/utils/train_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..91d5577d5f50c81624737d221dc572ac3c4cee56
--- /dev/null
+++ b/phantom/submodules/sam2/training/utils/train_utils.py
@@ -0,0 +1,288 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+import os
+import random
+import re
+from datetime import timedelta
+from typing import Optional
+
+import hydra
+
+import numpy as np
+import omegaconf
+import torch
+import torch.distributed as dist
+from iopath.common.file_io import g_pathmgr
+from omegaconf import OmegaConf
+
+
+def multiply_all(*args):
+ return np.prod(np.array(args)).item()
+
+
+def collect_dict_keys(config):
+    """This function recursively iterates through a dataset configuration and collects all the dict_key values that are defined"""
+ val_keys = []
+    # If this config points to the collate function, then it has a key
+ if "_target_" in config and re.match(r".*collate_fn.*", config["_target_"]):
+ val_keys.append(config["dict_key"])
+ else:
+ # Recursively proceed
+ for v in config.values():
+ if isinstance(v, type(config)):
+ val_keys.extend(collect_dict_keys(v))
+ elif isinstance(v, omegaconf.listconfig.ListConfig):
+ for item in v:
+ if isinstance(item, type(config)):
+ val_keys.extend(collect_dict_keys(item))
+ return val_keys
+
+
+class Phase:
+ TRAIN = "train"
+ VAL = "val"
+
+
+def register_omegaconf_resolvers():
+ OmegaConf.register_new_resolver("get_method", hydra.utils.get_method)
+ OmegaConf.register_new_resolver("get_class", hydra.utils.get_class)
+ OmegaConf.register_new_resolver("add", lambda x, y: x + y)
+ OmegaConf.register_new_resolver("times", multiply_all)
+ OmegaConf.register_new_resolver("divide", lambda x, y: x / y)
+ OmegaConf.register_new_resolver("pow", lambda x, y: x**y)
+ OmegaConf.register_new_resolver("subtract", lambda x, y: x - y)
+ OmegaConf.register_new_resolver("range", lambda x: list(range(x)))
+ OmegaConf.register_new_resolver("int", lambda x: int(x))
+ OmegaConf.register_new_resolver("ceil_int", lambda x: int(math.ceil(x)))
+ OmegaConf.register_new_resolver("merge", lambda *x: OmegaConf.merge(*x))
+
+
+def setup_distributed_backend(backend, timeout_mins):
+ """
+ Initialize torch.distributed and set the CUDA device.
+ Expects environment variables to be set as per
+ https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization
+ along with the environ variable "LOCAL_RANK" which is used to set the CUDA device.
+ """
+ # enable TORCH_NCCL_ASYNC_ERROR_HANDLING to ensure dist nccl ops time out after timeout_mins
+ # of waiting
+ os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
+ logging.info(f"Setting up torch.distributed with a timeout of {timeout_mins} mins")
+ dist.init_process_group(backend=backend, timeout=timedelta(minutes=timeout_mins))
+ return dist.get_rank()
+
+
+def get_machine_local_and_dist_rank():
+ """
+ Get the distributed and local rank of the current gpu.
+ """
+    local_rank = os.environ.get("LOCAL_RANK", None)
+    distributed_rank = os.environ.get("RANK", None)
+    assert (
+        local_rank is not None and distributed_rank is not None
+    ), "Please set the RANK and LOCAL_RANK environment variables."
+    return int(local_rank), int(distributed_rank)
+
+
+def print_cfg(cfg):
+ """
+    Supports printing both Hydra DictConfig and AttrDict configs.
+ """
+ logging.info("Training with config:")
+ logging.info(OmegaConf.to_yaml(cfg))
+
+
+def set_seeds(seed_value, max_epochs, dist_rank):
+ """
+    Set the python random, numpy and torch seeds for each gpu. Also set the CUDA
+    seeds if CUDA is available. This ensures the deterministic nature of the training.
+ """
+    # The pytorch sampler increments the seed by 1 every epoch, so we scale by max_epochs here.
+ seed_value = (seed_value + dist_rank) * max_epochs
+ logging.info(f"MACHINE SEED: {seed_value}")
+ random.seed(seed_value)
+ np.random.seed(seed_value)
+ torch.manual_seed(seed_value)
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed_all(seed_value)
+
+
+def makedir(dir_path):
+ """
+ Create the directory if it does not exist.
+ """
+ is_success = False
+ try:
+ if not g_pathmgr.exists(dir_path):
+ g_pathmgr.mkdirs(dir_path)
+ is_success = True
+ except BaseException:
+ logging.info(f"Error creating directory: {dir_path}")
+ return is_success
+
+
+def is_dist_avail_and_initialized():
+ if not dist.is_available():
+ return False
+ if not dist.is_initialized():
+ return False
+ return True
+
+
+def get_amp_type(amp_type: Optional[str] = None):
+ if amp_type is None:
+ return None
+ assert amp_type in ["bfloat16", "float16"], "Invalid Amp type."
+ if amp_type == "bfloat16":
+ return torch.bfloat16
+ else:
+ return torch.float16
+
+
+def log_env_variables():
+ env_keys = sorted(list(os.environ.keys()))
+ st = ""
+ for k in env_keys:
+ v = os.environ[k]
+ st += f"{k}={v}\n"
+ logging.info("Logging ENV_VARIABLES")
+ logging.info(st)
+
+
+class AverageMeter:
+ """Computes and stores the average and current value"""
+
+ def __init__(self, name, device, fmt=":f"):
+ self.name = name
+ self.fmt = fmt
+ self.device = device
+ self.reset()
+
+ def reset(self):
+ self.val = 0
+ self.avg = 0
+ self.sum = 0
+ self.count = 0
+ self._allow_updates = True
+
+ def update(self, val, n=1):
+ self.val = val
+ self.sum += val * n
+ self.count += n
+ self.avg = self.sum / self.count
+
+ def __str__(self):
+ fmtstr = "{name}: {val" + self.fmt + "} ({avg" + self.fmt + "})"
+ return fmtstr.format(**self.__dict__)
+
+
+class MemMeter:
+ """Computes and stores the current, avg, and max of peak Mem usage per iteration"""
+
+ def __init__(self, name, device, fmt=":f"):
+ self.name = name
+ self.fmt = fmt
+ self.device = device
+ self.reset()
+
+ def reset(self):
+ self.val = 0 # Per iteration max usage
+ self.avg = 0 # Avg per iteration max usage
+ self.peak = 0 # Peak usage for lifetime of program
+ self.sum = 0
+ self.count = 0
+ self._allow_updates = True
+
+ def update(self, n=1, reset_peak_usage=True):
+ self.val = torch.cuda.max_memory_allocated() // 1e9
+ self.sum += self.val * n
+ self.count += n
+ self.avg = self.sum / self.count
+ self.peak = max(self.peak, self.val)
+ if reset_peak_usage:
+ torch.cuda.reset_peak_memory_stats()
+
+ def __str__(self):
+ fmtstr = (
+ "{name}: {val"
+ + self.fmt
+ + "} ({avg"
+ + self.fmt
+ + "}/{peak"
+ + self.fmt
+ + "})"
+ )
+ return fmtstr.format(**self.__dict__)
+
+
+def human_readable_time(time_seconds):
+ time = int(time_seconds)
+ minutes, seconds = divmod(time, 60)
+ hours, minutes = divmod(minutes, 60)
+ days, hours = divmod(hours, 24)
+ return f"{days:02}d {hours:02}h {minutes:02}m"
+
+
+class DurationMeter:
+ def __init__(self, name, device, fmt=":f"):
+ self.name = name
+ self.device = device
+ self.fmt = fmt
+ self.val = 0
+
+ def reset(self):
+ self.val = 0
+
+ def update(self, val):
+ self.val = val
+
+ def add(self, val):
+ self.val += val
+
+ def __str__(self):
+ return f"{self.name}: {human_readable_time(self.val)}"
+
+
+class ProgressMeter:
+ def __init__(self, num_batches, meters, real_meters, prefix=""):
+ self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+ self.meters = meters
+ self.real_meters = real_meters
+ self.prefix = prefix
+
+ def display(self, batch, enable_print=False):
+ entries = [self.prefix + self.batch_fmtstr.format(batch)]
+ entries += [str(meter) for meter in self.meters]
+ entries += [
+ " | ".join(
+ [
+ f"{os.path.join(name, subname)}: {val:.4f}"
+ for subname, val in meter.compute().items()
+ ]
+ )
+ for name, meter in self.real_meters.items()
+ ]
+ logging.info(" | ".join(entries))
+ if enable_print:
+ print(" | ".join(entries))
+
+ def _get_batch_fmtstr(self, num_batches):
+ num_digits = len(str(num_batches // 1))
+ fmt = "{:" + str(num_digits) + "d}"
+ return "[" + fmt + "/" + fmt.format(num_batches) + "]"
+
+
+def get_resume_checkpoint(checkpoint_save_dir):
+ if not g_pathmgr.isdir(checkpoint_save_dir):
+ return None
+ ckpt_file = os.path.join(checkpoint_save_dir, "checkpoint.pt")
+ if not g_pathmgr.isfile(ckpt_file):
+ return None
+
+ return ckpt_file
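The meter classes above are plain bookkeeping objects, so a tiny sketch of how they combine in a loop may help. The import path assumes the sam2 submodule root is on sys.path; the loop and loss values are made up for illustration.

```python
# Sketch: track a running loss and elapsed time with the meters defined above.
import time

from training.utils.train_utils import AverageMeter, DurationMeter, ProgressMeter

def run_fake_epoch(num_batches: int = 5) -> None:
    loss_meter = AverageMeter("loss", device="cpu", fmt=":.4f")
    time_meter = DurationMeter("elapsed", device="cpu")
    progress = ProgressMeter(num_batches, [loss_meter, time_meter], real_meters={}, prefix="Train: ")

    start = time.time()
    for batch in range(num_batches):
        fake_loss = 1.0 / (batch + 1)            # stand-in for a real loss value
        loss_meter.update(fake_loss, n=1)        # running average over batches
        time_meter.update(time.time() - start)   # wall-clock time so far
        progress.display(batch, enable_print=True)

run_fake_epoch()
```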
diff --git a/requirements.txt b/requirements.txt
index 97ff04d2ffb49bebfbd8a0f1ae3f7e9763110c51..4a39351a69e39e7bd88910dac47bbd7334b9ec9d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,31 +1,32 @@
-# Gradio and Spaces
-gradio==4.44.0
-spaces==0.28.3
+# Gradio and Spaces
+gradio>=4.44.0
+spaces>=0.28.3
-# PyTorch (preinstalled on HF, but pin the version)
+# PyTorch (use the CUDA 12.1 build)
+--extra-index-url https://download.pytorch.org/whl/cu121
torch==2.1.0
torchvision==0.16.0
# Basic scientific computing
numpy==1.26.4
opencv-python==4.8.1.78
-pillow==10.1.0
-scipy==1.11.4
-scikit-learn==1.3.2
+pillow>=10.1.0
+scipy>=1.11.4
+scikit-learn>=1.3.2
# Configuration management
hydra-core==1.3.2
omegaconf==2.3.0
# Video processing
-mediapy==1.1.9
+mediapy>=1.1.9
-# 3D processing
-open3d==0.18.0
+# 3D processing
+open3d>=0.18.0
# Machine learning
transformers==4.42.4
-timm==0.9.12
+timm>=0.9.12
# Other tools
joblib
@@ -35,5 +36,5 @@ Rtree
protobuf==3.20.0
gdown
-# MMCV (required by Phantom)
+# MMCV (setup.sh tries to install the full build)
mmcv==1.3.9
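With torch pinned to 2.1.0 from the cu121 extra index, a quick runtime check (a sketch, not part of the repo) can confirm the installed wheel actually matches; on Spaces the GPU may only be attached while a job is running, so a negative CUDA check at startup is not necessarily an error.

```python
# Sanity-check the pinned PyTorch build from requirements.txt.
import torch

print("torch version:", torch.__version__)         # expected to start with 2.1.0
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
    print("cuda runtime:", torch.version.cuda)      # expected 12.1 for the cu121 wheel
```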
diff --git a/setup.sh b/setup.sh
index 19f79b00c8c592dff77f2b0bd272b0cfe71336f7..00965cf30c2ac9e1b8ecde5c64dd8fd0619ada0f 100644
--- a/setup.sh
+++ b/setup.sh
@@ -1,127 +1,156 @@
#!/bin/bash
-# Phantom environment setup script
-# Runs when app.py starts (first run only)
+# Phantom Hugging Face Spaces install script
+# Inference-only mode - skips training-related dependencies
set -e
PHANTOM_DIR="/home/user/app/phantom"
LOG_FILE="/tmp/phantom_setup.log"
-# Logging helper
log() {
echo "[$(date +'%H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
-log "🚀 Starting Phantom environment setup"
+log "🚀 Starting Phantom environment setup (Inference Only)"
-# Check the phantom directory
+# Check the phantom directory
if [ ! -d "$PHANTOM_DIR" ]; then
-    log "❌ Phantom directory not found"
+    log "❌ Phantom directory not found"
exit 1
fi
cd "$PHANTOM_DIR"
-# ========== Install submodules ==========
+# ========== Install inference-required dependencies ==========
-# 1. SAM2
+# 1. Install PyTorch (if not already installed)
+if ! python -c "import torch" 2>/dev/null; then
+    log "📦 Installing PyTorch..."
+ pip install -q torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
+fi
+
+# 2. SAM2 (segmentation model)
if [ ! -f "/tmp/.sam2_installed" ]; then
-    log "📦 Installing SAM2..."
-    cd submodules/sam2
-    pip install -q -e . 2>&1 | tee -a "$LOG_FILE" || log "⚠️ SAM2 install warnings"
+    log "📦 Installing SAM2..."
+    cd "$PHANTOM_DIR/submodules/sam2"
+    pip install -q -e . 2>&1 | tee -a "$LOG_FILE" || log "⚠️ SAM2 warning"
touch /tmp/.sam2_installed
-    log "✅ SAM2 done"
+    log "✅ SAM2 done"
fi
-# 2. HaMeR
+# 3. HaMeR (hand pose estimation)
if [ ! -f "/tmp/.hamer_installed" ]; then
-    log "📦 Installing HaMeR..."
+    log "📦 Installing HaMeR..."
cd "$PHANTOM_DIR/submodules/phantom-hamer"
-    pip install -q -e .[all] 2>&1 | tee -a "$LOG_FILE" || log "⚠️ HaMeR install warnings"
-
-    # Install ViTPose
+    pip install -q -e .[all] 2>&1 | tee -a "$LOG_FILE" || log "⚠️ HaMeR warning"
+
+    # Install ViTPose
if [ -d "third-party/ViTPose" ]; then
- pip install -q -e third-party/ViTPose 2>&1 | tee -a "$LOG_FILE"
+        log "📦 Installing ViTPose..."
+ pip install -q -e third-party/ViTPose 2>&1 | tee -a "$LOG_FILE" || true
fi
-
-    # Download demo data (if it does not exist)
- if [ ! -d "_DATA/hamer_demo_data" ]; then
-        log "📥 Downloading HaMeR demo data..."
- cd _DATA
- wget -q https://www.cs.utexas.edu/~pavlakos/hamer/data/hamer_demo_data.tar.gz
- tar -xzf hamer_demo_data.tar.gz 2>&1 | tee -a "$LOG_FILE"
- rm hamer_demo_data.tar.gz
- fi
-
- touch /tmp/.hamer_installed
-    log "✅ HaMeR done"
-fi
-# 3. MMCV-Full
-if [ ! -f "/tmp/.mmcv_installed" ]; then
-    log "📦 Installing MMCV-Full..."
- pip install -q mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html 2>&1 | tee -a "$LOG_FILE"
- touch /tmp/.mmcv_installed
-    log "✅ MMCV done"
+ touch /tmp/.hamer_installed
+    log "✅ HaMeR done"
fi
-# 4. Robosuite
-if [ ! -f "/tmp/.robosuite_installed" ]; then
-    log "📦 Installing Robosuite..."
- cd "$PHANTOM_DIR/submodules/phantom-robosuite"
- pip install -q -e . 2>&1 | tee -a "$LOG_FILE"
- touch /tmp/.robosuite_installed
-    log "✅ Robosuite done"
+# 4. Download HaMeR demo data
+if [ ! -d "$PHANTOM_DIR/submodules/phantom-hamer/_DATA/hamer_demo_data" ]; then
+    log "📥 Downloading HaMeR demo data..."
+ cd "$PHANTOM_DIR/submodules/phantom-hamer"
+ mkdir -p _DATA && cd _DATA
+ if [ ! -f "hamer_demo_data.tar.gz" ]; then
+        wget -q https://www.cs.utexas.edu/~pavlakos/hamer/data/hamer_demo_data.tar.gz || log "⚠️ HaMeR data download failed"
+ fi
+ if [ -f "hamer_demo_data.tar.gz" ]; then
+ tar --warning=no-unknown-keyword -xzf hamer_demo_data.tar.gz 2>&1 | tee -a "$LOG_FILE" || true
+ rm -f hamer_demo_data.tar.gz
+        log "✅ HaMeR demo data done"
+ fi
fi
-# 5. Robomimic
-if [ ! -f "/tmp/.robomimic_installed" ]; then
-    log "📦 Installing Robomimic..."
- cd "$PHANTOM_DIR/submodules/phantom-robomimic"
- pip install -q -e . 2>&1 | tee -a "$LOG_FILE"
- touch /tmp/.robomimic_installed
-    log "✅ Robomimic done"
+# 5. MMCV (base version only; sufficient for inference)
+if [ ! -f "/tmp/.mmcv_installed" ]; then
+    log "📦 Installing MMCV..."
+ pip install -q mmcv==1.3.9 2>&1 | tee -a "$LOG_FILE" || true
+    # Try mmcv-full as well; it is fine if this fails
+    pip install -q mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html 2>&1 | tee -a "$LOG_FILE" || log "⚠️ MMCV-full skipped, using the base version"
+ touch /tmp/.mmcv_installed
+    log "✅ MMCV done"
fi
-# 6. E2FGVI weights
-if [ ! -f "/tmp/.e2fgvi_weights" ]; then
-    log "📥 Downloading E2FGVI weights..."
- cd "$PHANTOM_DIR/submodules/phantom-E2FGVI/E2FGVI/release_model"
- if [ ! -f "E2FGVI-HQ.pth" ]; then
- gdown --fuzzy https://drive.google.com/file/d/10wGdKSUOie0XmCr8SQ2A2FeDe-mfn5w3/view?usp=sharing 2>&1 | tee -a "$LOG_FILE"
- fi
- touch /tmp/.e2fgvi_weights
-    log "✅ E2FGVI weights done"
+# 6. E2FGVI (video inpainting)
+E2FGVI_DIR="$PHANTOM_DIR/submodules/phantom-E2FGVI/E2FGVI/release_model"
+if [ ! -f "$E2FGVI_DIR/E2FGVI-HQ.pth" ]; then
+    log "📥 Downloading E2FGVI weights..."
+ mkdir -p "$E2FGVI_DIR"
+ cd "$E2FGVI_DIR"
+ pip install -q gdown
+    gdown --fuzzy "https://drive.google.com/file/d/10wGdKSUOie0XmCr8SQ2A2FeDe-mfn5w3/view?usp=sharing" 2>&1 | tee -a "$LOG_FILE" || log "⚠️ E2FGVI weights download failed"
+    log "✅ E2FGVI weights done"
fi
-# 7. E2FGVI
if [ ! -f "/tmp/.e2fgvi_installed" ]; then
-    log "📦 Installing E2FGVI..."
+    log "📦 Installing E2FGVI..."
cd "$PHANTOM_DIR/submodules/phantom-E2FGVI"
pip install -q -e . 2>&1 | tee -a "$LOG_FILE"
touch /tmp/.e2fgvi_installed
-    log "✅ E2FGVI done"
+    log "✅ E2FGVI done"
fi
-# 8. Phantom main package
-if [ ! -f "/tmp/.phantom_installed" ]; then
-    log "📦 Installing Phantom main package..."
+# ========== Skip training dependencies ==========
+# The following packages are only needed for training, not for inference:
+# - phantom-robosuite (robot simulation)
+# - phantom-robomimic (robot learning)
+log "⏭️ Skipping training dependencies (robosuite, robomimic)"
+
+# 7. Other inference dependencies
+log "📦 Installing other dependencies..."
+pip install -q joblib mediapy 2>&1 | tee -a "$LOG_FILE" || true
+pip install -q transformers==4.42.4 2>&1 | tee -a "$LOG_FILE" || true
+pip install -q PyOpenGL==3.1.4 Rtree protobuf==3.20.0 2>&1 | tee -a "$LOG_FILE" || true
+pip install -q hydra-core==1.3.2 omegaconf==2.3.0 2>&1 | tee -a "$LOG_FILE" || true
+pip install -q numpy==1.26.4 2>&1 | tee -a "$LOG_FILE" || true
+# open3d is large; try to install it, but do not require it
+pip install -q open3d 2>&1 | tee -a "$LOG_FILE" || log "⚠️ open3d skipped"
+
+# 8. Phantom main package
+if [ ! -f "/tmp/.phantom_pkg_installed" ]; then
+    log "📦 Installing Phantom main package..."
cd "$PHANTOM_DIR"
pip install -q -e . 2>&1 | tee -a "$LOG_FILE"
- touch /tmp/.phantom_installed
-    log "✅ Phantom main package done"
+    touch /tmp/.phantom_pkg_installed
+    log "✅ Phantom main package done"
fi
-# 9. Verify MANO models
+# 9. Download sample data (optional)
+SAMPLE_DATA_DIR="$PHANTOM_DIR/data/raw"
+if [ ! -d "$SAMPLE_DATA_DIR/pick_and_place" ]; then
+    log "📥 Downloading sample data..."
+ mkdir -p "$SAMPLE_DATA_DIR"
+ cd "$SAMPLE_DATA_DIR"
+    wget -q https://download.cs.stanford.edu/juno/phantom/pick_and_place.zip || log "⚠️ Sample data download failed"
+ if [ -f "pick_and_place.zip" ]; then
+ unzip -q pick_and_place.zip
+ rm -f pick_and_place.zip
+        log "✅ Sample data done"
+ fi
+fi
+
+# 10. Check MANO models
MANO_DIR="$PHANTOM_DIR/submodules/phantom-hamer/_DATA/data/mano"
+mkdir -p "$MANO_DIR"
+
+# Check whether they already exist (the user may have placed them in the repo)
if [ -f "$MANO_DIR/MANO_LEFT.pkl" ] && [ -f "$MANO_DIR/MANO_RIGHT.pkl" ]; then
-    log "✅ MANO models ready"
+    log "✅ MANO models ready"
else
-    log "⚠️ MANO models missing, please upload them"
+    log "⚠️ MANO models missing!"
+    log "   Please place the files in: $MANO_DIR"
fi
-log "🎉 Phantom environment setup complete"
-log "Log file: $LOG_FILE"
-
# Mark setup as complete
touch /tmp/.phantom_ready
+
+log "🎉 Phantom environment setup complete (Inference Only)"
+log "📝 Log file: $LOG_FILE"
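The script communicates with the app only through sentinel files (`/tmp/.phantom_ready` plus the per-step markers) and the log file, so a small sketch of how the app side could gate the one-time setup on that sentinel may be useful; the script location and function name here are illustrative, not part of the repo.

```python
# Sketch: run setup.sh once and rely on the /tmp/.phantom_ready sentinel afterwards.
import subprocess
from pathlib import Path

SETUP_SCRIPT = Path(__file__).parent / "setup.sh"   # assumed location next to the app
READY_SENTINEL = Path("/tmp/.phantom_ready")        # touched at the end of setup.sh
SETUP_LOG = Path("/tmp/phantom_setup.log")          # written by the log() helper

def ensure_phantom_ready() -> bool:
    """Run setup.sh on the first call; later calls return immediately via the sentinel."""
    if READY_SENTINEL.exists():
        return True
    result = subprocess.run(["bash", str(SETUP_SCRIPT)], capture_output=True, text=True)
    if result.returncode != 0:
        tail = SETUP_LOG.read_text()[-2000:] if SETUP_LOG.exists() else result.stderr
        print(f"setup.sh failed:\n{tail}")
        return False
    return READY_SENTINEL.exists()
```

Because each step in the script also drops its own `/tmp/.*_installed` marker, re-running it after a partial failure only repeats the steps that did not finish.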